// Source code of org.apache.pdfbox.pdfparser.NonSequentialPDFParser

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;




import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.KeyStore;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import java.util.Map.Entry;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.io.PushBackInputStream;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandlersManager;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.persistence.util.COSObjectKey;


/**
 * PDFParser which first reads startxref and xref tables in order to know valid
 * objects and parse only these objects. Thus it is closer to a conforming parser
 * than the sequential reading of {@link PDFParser}.
 * 
 * This class can be used as a {@link PDFParser} replacement. First {@link #parse()}
 * must be called before page objects can be retrieved, e.g. {@link #getPDDocument()}.
 * 
 * This class is a much enhanced version of <code>QuickParser</code> presented in 
 * <a href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a>
 * by Jeremy Villalobos.
 */
public class NonSequentialPDFParser extends PDFParser
{


    /** Name of the system property which, when set to "true", enables {@link #parseMinimalCatalog}. */
    public static final String SYSPROP_PARSEMINIMAL = 
        "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal";
    /** Name of the system property read by the constructor to preset the EOF lookup range
     *  (see {@link #setEOFLookupRange(int)}). */
    public static final String SYSPROP_EOFLOOKUPRANGE = 
        "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";
        
    /** Placeholder stream handed to the super constructor; the constructor replaces
     *  <code>pdfSource</code> with a stream over the real file afterwards. */
    private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream( new byte[0] );
    
    // default for readTrailBytes and the keywords searched for / expected while parsing
    private static final int    DEFAULT_TRAIL_BYTECOUNT = 2048;
    private static final char[] EOF_MARKER              = new char[] { '%','%','E','O','F' };
    private static final char[] STARTXREF_MARKER        = new char[] { 's','t','a','r','t','x','r','e','f' };
    private static final char[] OBJ_MARKER              = new char[] { 'o','b','j' };
    
    /** The PDF file being parsed. */
    private final File pdfFile;
    /** Stream over {@link #pdfFile}; wrapped by the push back stream assigned to <code>pdfSource</code>. */
    private final RandomAccessBufferedFileInputStream raStream;
    
    /** Set by the initial parse when the trailer has an ENCRYPT entry; <code>null</code> otherwise. */
    private SecurityHandler securityHandler = null;
    
    // decryption settings; keyStoreFilename and alias are only read in the visible part
    // of this class (no assignment is visible here), password is set by the constructor
    private String keyStoreFilename = null;
    private String alias            = null;
    private String password         = "";
    private int    readTrailBytes   = DEFAULT_TRAIL_BYTECOUNT;    // how many trailing bytes to read for EOF marker
    
    /** If <code>true</code> object references in catalog are not followed;
     *  pro: page objects will be only parsed when needed; cons: some information of catalog
     *  might not be available (e.g. outline).
     *  Catalog parsing without pages is not an option since a number of entries will
     *  also refer to page objects (like OpenAction).
     */
    private boolean parseMinimalCatalog = "true".equals( System.getProperty( SYSPROP_PARSEMINIMAL ) );
    
    // progress flags: initialParseDone is set at the end of initialParse();
    // allPagesParsed is set once the whole catalog or every page has been parsed
    private boolean initialParseDone = false;
    private boolean allPagesParsed   = false;
        
    private static final Log LOG = LogFactory.getLog( NonSequentialPDFParser.class );
    
    // ------------------------------------------------------------------------
    /** 
     * Constructs parser for given file using memory buffer.
     * Delegates to {@link #NonSequentialPDFParser(File, RandomAccess)} with a
     * <code>null</code> buffer, in which case a new {@link RandomAccessBuffer} is created.
     * 
     * @param filename the filename of the pdf to be parsed
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser( String filename ) throws IOException
    {
        this( new File( filename ), null );
    }
    
    /** 
     * Constructs parser for given file using given buffer for temporary storage.
     * The empty string is used as decryption password.
     * 
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing; if <code>null</code> a new
     *              {@link RandomAccessBuffer} is used
     *  
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser( File file, RandomAccess raBuf ) throws IOException
    {
        this(file, raBuf, "");
    }
    
    /** 
     * Constructs parser for given file using given buffer for temporary storage. 
     * 
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing
     *  
     * @throws IOException If something went wrong.
     */
    /** 
     * Constructs parser for given file using given buffer for temporary storage. 
     * 
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing
     * @param decryptionPassword password to be used for decryption
     *  
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser( File file, RandomAccess raBuf, String decryptionPassword ) throws IOException
    {
        super( EMPTY_INPUT_STREAM, null, false );
            
        String eofLookupRangeStr = System.getProperty( SYSPROP_EOFLOOKUPRANGE );
        if ( eofLookupRangeStr != null )
        {
            try
            {
                setEOFLookupRange( Integer.parseInt( eofLookupRangeStr ) );
            } 
            catch ( NumberFormatException nfe )
            {
                LOG.warn( "System property " + SYSPROP_EOFLOOKUPRANGE + 
                        " does not contain an integer value, but: '" + eofLookupRangeStr + "'" );
            }
        }
    
        pdfFile = file;
        raStream = new RandomAccessBufferedFileInputStream( pdfFile );
    
        setDocument( ( raBuf == null ) ? new COSDocument( new RandomAccessBuffer(), false ) :
                                         new COSDocument( raBuf, false ) );
    
        pdfSource = new PushBackInputStream( raStream, 4096 );
        
        password = decryptionPassword;
    }
        
    // ------------------------------------------------------------------------
    /** 
     *  Sets how many trailing bytes of PDF file are searched for
     *  EOF marker and 'startxref' marker.
     *  If not set we use default value {@link #DEFAULT_TRAIL_BYTECOUNT}.
     *  
     *  <p<We check that new value is at least 16. However for practical use
     *  cases this value should not be lower than 1000; even 2000
     *  was found to not be enough in some cases where some trailing
     *  garbage like HTML snippets followed the EOF marker.</p>
     *  
     *  <p>In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined
     *  this value will be set on initialization but can be overwritten later.</p>
     *  
     *  @param byteCount number of trailing bytes
     */
    public void setEOFLookupRange( int byteCount )
    {
            if ( byteCount > 15 )
            {
                readTrailBytes = byteCount;
            }
    }
        
    // ------------------------------------------------------------------------
    /**
     * The initial parse will first parse only the trailer, the xrefstart and 
     * all xref tables to have a pointer (offset) to all the pdf's objects.
     * It can handle linearized pdfs, which will have an xref at the 
     * end pointing to an xref at the beginning of the file.
     * Last the root object is parsed.
     * 
     * @throws IOException
     */
    private void initialParse() throws IOException
    {
        final long startxrefOff = getStartxrefOffset();
            
        // ---- parse startxref
        setPdfSource( startxrefOff );
        parseStartXref();
        
        final long xrefOffset = document.getStartXref();
        long       prev       = xrefOffset;
            
        // ---- parse whole chain of xref tables/object streams using PREV reference
        while( prev > -1 )
        {
            // seek to xref table
            setPdfSource( prev );
            
            // -- parse xref
            if ( pdfSource.peek() == 'x' )
            {
                // xref table and trailer
                // use existing parser to parse xref table
                parseXrefTable( prev );
                
                // parse the last trailer.
                if ( ! parseTrailer() )
                {
                    throw new IOException( "Expected trailer object at position: " + pdfSource.getOffset() );
                }
                COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
                prev = trailer.getInt( COSName.PREV );
            }
            else
            {
                // xref stream
                prev = parseXrefObjStream( prev );
            }
        }
        
        // ---- build valid xrefs out of the xref chain
        xrefTrailerResolver.setStartxref( xrefOffset );
        document.setTrailer( xrefTrailerResolver.getTrailer() );
            
        // ---- prepare encryption if necessary
        COSBase trailerEncryptItem = document.getTrailer().getItem( COSName.ENCRYPT );
        if ( trailerEncryptItem != null ) 
        {
            if ( trailerEncryptItem instanceof COSObject )
            {
                COSObject trailerEncryptObj = (COSObject) trailerEncryptItem;
                parseObjectDynamically( trailerEncryptObj, true );
            }
            
            try
            {
                PDEncryptionDictionary encParameters = new PDEncryptionDictionary( document.getEncryptionDictionary() );
                      
                DecryptionMaterial decryptionMaterial = null;
                if( keyStoreFilename != null )
                {
                    KeyStore ks = KeyStore.getInstance( "PKCS12" );
                    ks.load( new FileInputStream( keyStoreFilename ), password.toCharArray() );
                    
                    decryptionMaterial = new PublicKeyDecryptionMaterial( ks, alias, password );
                }
                else
                {
                    decryptionMaterial = new StandardDecryptionMaterial( password );
                }
                      
                securityHandler = SecurityHandlersManager.getInstance().getSecurityHandler( encParameters.getFilter() );
                securityHandler.prepareForDecryption( encParameters, document.getDocumentID(), decryptionMaterial );
                      
                AccessPermission permission = securityHandler.getCurrentAccessPermission();
                if ( ! permission.canExtractContent() )
                {
                    LOG.warn( "PDF file '" + pdfFile.getPath() + "' does not allow extracting content." );
                }
                  
            }
            catch ( Exception e )
            {
                throw new IOException( "Error (" + e.getClass().getSimpleName() + 
                        ") while creating security handler for decryption: " +
                                                   e.getMessage() /*, e // TODO: remove remark with Java 1.6 */);
            }
        }
    
        // ---- parse catalog or root object
        COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem( COSName.ROOT );
        
        if ( root == null )
        {
            throw new IOException( "Missing root object specification in trailer." );
        }
            
        parseObjectDynamically( root, false );
        
        // ---- resolve all objects (including pages)
        if ( ! parseMinimalCatalog )
        {
            COSObject catalogObj = document.getCatalog();
            if ( catalogObj != null )
            {
                if ( catalogObj.getObject() instanceof COSDictionary )
                {
                    parseDictObjects( (COSDictionary) catalogObj.getObject(), (COSName[]) null );
                    allPagesParsed = true;
                    document.setDecrypted();
                }
            }
        }
        initialParseDone = true;
    }
    
    // ------------------------------------------------------------------------
    /** Parses an xref object stream starting with indirect object id.
     *  
     *  @return value of PREV item in dictionary or <code>-1</code> if no such item exists
     */
    private long parseXrefObjStream( long objByteOffset ) throws IOException
    {
        // ---- parse indirect object head
        readInt();
        readInt();
        readPattern( OBJ_MARKER );
        
        COSDictionary dict       = parseCOSDictionary();
        COSStream     xrefStream = parseCOSStream(dict, getDocument().getScratchFile() );
        parseXrefStream( xrefStream, (int) objByteOffset );
        
        return dict.getLong( COSName.PREV );
    }


    // ------------------------------------------------------------------------
    /** Get current offset in file at which next byte would be read. */
    private final long getPdfSourceOffset()
    {
        return pdfSource.getOffset();
    }


    /**
     * Sets {@link #pdfSource} to start next parsing at given file offset.
     *
     * @param fileOffset offset handed to the seek call of {@link #pdfSource}
     *
     * @throws IOException if seeking fails
     */
    private final void setPdfSource( long fileOffset ) throws IOException
    {
        
        pdfSource.seek( fileOffset );


        // The disabled code below is an alternative that would re-open the file with a
        // plain buffered stream and skip to the offset instead of seeking.
        // alternative using 'old fashioned' input stream
        //        if ( pdfSource != null )
        //            pdfSource.close();
        //        
        //        pdfSource = new PushBackInputStream(
        //                            new BufferedInputStream(
        //                                new FileInputStream( file ), 16384),  4096);
        //        pdfSource.skip( _fileOffset );
    }


    /**
     * Enable handling of alternative pdfSource implementation.
     * Currently a no-op: the only statements are commented out. They would close
     * {@link #pdfSource}, matching the disabled alternative in {@link #setPdfSource(long)}.
     *
     * @throws IOException declared for the disabled close call; never thrown at present
     */
    private final void releasePdfSourceInputStream() throws IOException
    {
        //        if ( pdfSource != null )
        //            pdfSource.close();
    }


    private final void closeFileStream() throws IOException 
    {
        if ( pdfSource != null )
        {
            pdfSource.close();
        }
    }


    // ------------------------------------------------------------------------
    /** Looks for and parses startxref. We first look for last '%%EOF' marker
     *  (within last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via
     *  {@link #setEOFLookupRange(int)}) and go back to find <code>startxref</code>. */
    private final long getStartxrefOffset() throws IOException
    {
        byte[] buf; 
        long   skipBytes;
        
        // ---- read trailing bytes into buffer
        final long fileLen = pdfFile.length();
        
        FileInputStream fIn = null;
        try 
        {
            fIn = new FileInputStream( pdfFile );
            
            final int trailByteCount = ( fileLen < readTrailBytes ) ? (int) fileLen : readTrailBytes;
            buf = new byte[ trailByteCount ]; 
            fIn.skip( skipBytes = fileLen - trailByteCount );
            
            int off = 0;
            int readBytes;
            while ( off < trailByteCount )
            {
                readBytes = fIn.read( buf, off, trailByteCount - off );
                // in order to not get stuck in a loop we check readBytes (this should never happen)
                if ( readBytes < 1 )
                {
                    throw new IOException( "No more bytes to read for trailing buffer, but expected: " + 
                                                               ( trailByteCount - off ) );
                }
                off += readBytes;
            }
        }
        finally
        {
            if ( fIn != null )
            {
                try 
                { 
                    fIn.close(); 
                } 
                catch ( IOException ioe ) 
                {}
            }
        }
            
        // ---- find last '%%EOF'
        int bufOff = lastIndexOf( EOF_MARKER, buf, buf.length );
        
        if ( bufOff < 0 )
        {
            throw new IOException( "Missing end of file marker '" + ( new String( EOF_MARKER ) ) + "'" );
        }   
        // ---- find last startxref preceding EOF marker
        bufOff = lastIndexOf( STARTXREF_MARKER, buf, bufOff );
        
        if ( bufOff < 0 )
        {
            throw new IOException( "Missing 'startxref' marker." );
        }
        return skipBytes + bufOff;
    }


    // ------------------------------------------------------------------------
    /** Searches last appearance of pattern within buffer. Lookup before _lastOff
     *  and goes back until 0.
     *  
     *  @param pattern  pattern to search for
     *  @param buf      buffer to search pattern in
     *  @param endOff   offset (exclusive) where lookup starts at
     *  
     *  @return  start offset of pattern within buffer or <code>-1</code> if pattern could not be found 
     */
    private final int lastIndexOf( final char[] pattern, final byte[] buf, final int endOff )
    {
        final int lastPatternChOff = pattern.length - 1;
        
        int  bufOff   = endOff;
        int  patOff   = lastPatternChOff;
        char lookupCh = pattern[ patOff ];
        
        while ( --bufOff >= 0 ) 
        {
            if ( buf[ bufOff ] == lookupCh ) 
            {
                if ( --patOff < 0 )
                {
                    // whole pattern matched
                    return bufOff;
                }
                // matched current char, advance to preceding one
                lookupCh = pattern[ patOff ];
            }
            else if ( patOff < lastPatternChOff )
            {
                // no char match but already matched some chars; reset 
                lookupCh = pattern[ patOff = lastPatternChOff ];
            }
        }
        
        return -1;
    }


    // ------------------------------------------------------------------------
    /** Reads given pattern from {@link #pdfSource}. Skipping whitespace at start and end.
     * 
     * @throws IOException if pattern could not be read
     */
    private final void readPattern( final char[] pattern ) throws IOException
    {
        skipSpaces();
        
        for ( char c : pattern )
        {
            if ( pdfSource.read() != c )
            {
                throw new IOException( "Expected pattern '" + new String( pattern )  +
                        " but missed at character '" + c + "'" );
            }
        }
        
        skipSpaces();
    }


    // ------------------------------------------------------------------------
    private COSDictionary pagesDictionary = null;
    
    /** Returns PAGES {@link COSDictionary} object or throws {@link IOException}
     *  if PAGES dictionary does not exist. */
    private COSDictionary getPagesObject() throws IOException 
    {
        if ( pagesDictionary != null )
        {
            return pagesDictionary;
        }   
        COSObject pages = (COSObject) document.getCatalog().getItem( COSName.PAGES );
            
        if ( pages == null )
        {
            throw new IOException( "Missing PAGES entry in document catalog." );
        }
            
        COSBase object = parseObjectDynamically( pages, false );
            
        if ( ! ( object instanceof COSDictionary ) )
        {
            throw new IOException( "PAGES not a dictionary object, but: " +
                    object.getClass().getSimpleName() );
        }
            
        pagesDictionary = (COSDictionary) object;
        
        return pagesDictionary;
    }


    // ------------------------------------------------------------------------
    /** Parses all objects needed by pages and closes input stream. */
    /**
     * {@inheritDoc}
     */
    @Override 
    public void parse() throws IOException 
    {
        boolean exceptionOccurred = true;    // set to false if all is processed
        
        try
        {
            if ( ! initialParseDone )
            {
                initialParse();
            }
                
            final int pageCount = getPageNumber();
            
            if ( ! allPagesParsed )
            {
                for ( int pNr = 0; pNr < pageCount; pNr++ )
                {
                    getPage( pNr );
                }
                allPagesParsed = true;
                document.setDecrypted();
            }
            
            exceptionOccurred = false;
        }
        finally
        {
            try 
            { 
                closeFileStream();
            } 
            catch ( IOException ioe ) 
            {}
                    
            if ( exceptionOccurred && ( document != null ) )
            {
                try 
                {
                    document.close(); 
                } 
                catch ( IOException ioe ) 
                {}
            }
        }
    }   


    // ------------------------------------------------------------------------
    /** 
     * Returns security handler of the document or <code>null</code> if document
     * is not encrypted or {@link #parse()} wasn't called before. 
     * The handler is created during the initial parse when the trailer contains
     * an ENCRYPT entry.
     *
     * @return the security handler.
     */
    public SecurityHandler getSecurityHandler() 
    {
        return securityHandler;
    }


    // ------------------------------------------------------------------------
    /**
     * This will get the PD document that was parsed.  When you are done with
     * this document you must call close() on it to release resources.
     *
     * Overwriting super method was necessary in order to set security handler.
     *
     * @return The document at the PD layer.
     *
     * @throws IOException If there is an error getting the document.
     */
    @Override
    public PDDocument getPDDocument() throws IOException
    {
        PDDocument pdDocument = super.getPDDocument();
        if ( securityHandler != null )
            pdDocument.setSecurityHandler( securityHandler );
        
        return pdDocument;
    }


    // ------------------------------------------------------------------------
    /**
     * Returns the number of pages in a document.
     * 
     * @return the number of pages.
     * 
     * @throws IOException  if PAGES or other needed object is missing 
     */
    public int getPageNumber() throws IOException
    {
        int pageCount = getPagesObject().getInt( COSName.COUNT );
        
        if ( pageCount < 0 )
        {
            throw new IOException( "No page number specified." );
        }   
        return pageCount;
    }


    // ------------------------------------------------------------------------
    /**
     * Returns the page requested with all the objects loaded into it.
     * 
     * @param pageNr starts from 0 to the number of pages.
     * @return the page with the given pagenumber.
     * @throws IOException If something went wrong.
     */
    public PDPage getPage( int pageNr ) throws IOException
    {
        getPagesObject();
        
        // ---- get list of top level pages
        COSArray kids = (COSArray) pagesDictionary.getDictionaryObject( COSName.KIDS );
        
        if ( kids == null )
        {
            throw new IOException( "Missing 'Kids' entry in pages dictionary." );
        }
            
        // ---- get page we are looking for (possibly going recursively into subpages)
        COSObject pageObj = getPageObject( pageNr, kids, 0 );
        
        if ( pageObj == null )
        {
            throw new IOException( "Page " + pageNr + " not found." );
        }
            
        // ---- parse all objects necessary to load page.
        COSDictionary pageDict = (COSDictionary) pageObj.getObject();
        
        if ( parseMinimalCatalog && ( ! allPagesParsed ) )
        {
            // parse page resources since we did not do this on start
            COSDictionary resDict = (COSDictionary) pageDict.getDictionaryObject( COSName.RESOURCES );
            parseDictObjects( resDict );
        }
        
        return new PDPage( pageDict );
    }


    /**
     * Returns the object for a specific page.
     * The page tree is made up of kids.  The kids have COSArray with COSObjects
     * inside of them. The COSObject can be parsed using the dynamic parsing method
     * We want to only parse the minimum COSObjects and still return a complete page.
     * ready to be used.
     * 
     * @param num  the requested page number; numbering starts with 0
     * @param startKids Kids array to start with looking up page number
     * @param startPageCount
     * 
     * @return  page object or <code>null</code> if no such page exists
     * 
     * @throws IOException
     */
    private COSObject getPageObject( int num, COSArray startKids, int startPageCount ) throws IOException
    {
        int               curPageCount = startPageCount;
        Iterator<COSBase> kidsIter     = startKids.iterator();
        
        while( kidsIter.hasNext() )
        {
            COSObject obj  = (COSObject) kidsIter.next();
            COSBase   base = obj.getObject();
            if( base == null )
            {
                base = parseObjectDynamically( obj, false );
                obj.setObject( base );
            }
            
            COSDictionary dic   = (COSDictionary) base;
            int           count = dic.getInt( COSName.COUNT );
            if ( count >= 0 ) 
            {
                // skip this branch if requested page comes later
                if( ( curPageCount + count ) <= num ) 
                {
                    curPageCount += count;
                    continue;
                }
            }
                    
            COSArray kids = (COSArray) dic.getDictionaryObject( COSName.KIDS );
            if( kids != null)
            {
                // recursively scan subpages
                COSObject ans = getPageObject( num, kids, curPageCount );
                // if ans is not null, we got what we were looking for
                if( ans != null )
                {
                    return ans;
                }
            }
            else
            {
                // found page?
                if( curPageCount == num ) 
                {
                    return obj;
                }
                // page has no kids and it is not the page we are looking for 
                curPageCount++;
            }
        }
        return null;
    }


    /** Creates a unique object id using object number and object generation number. 
     *  (requires object number < 2^31)) */
    private final long getObjectId( final COSObject obj ) 
    {
        return ( obj.getObjectNumber().longValue() << 32 ) | obj.getGenerationNumber().longValue();
    }
    
    /** Adds all from newObjects to toBeParsedList if it is not an COSObject
     *  or we didn't add this COSObject already (checked via addedObjects). */
    private final void addNewToList( final Queue<COSBase> toBeParsedList,
                                         final Collection<COSBase> newObjects,
                                         final Set<Long> addedObjects )
    {
        for ( COSBase newObject : newObjects )
        {
            if ( newObject instanceof COSObject ) 
            {
                final long objId  = getObjectId( (COSObject) newObject );
                if ( ! addedObjects.add( objId ) )
                {
                    continue;
                }
            }
            toBeParsedList.add( newObject );
        }
    }


    /** Adds newObject to toBeParsedList if it is not an COSObject
     *  or we didn't add this COSObject already (checked via addedObjects). */
    private final void addNewToList( final Queue<COSBase> toBeParsedList,
                                         final COSBase newObject,
                                         final Set<Long> addedObjects )
    {
        if ( newObject instanceof COSObject ) 
        {
            final long objId  = getObjectId( (COSObject) newObject );
            if ( ! addedObjects.add( objId ) )
            {
                return;
            }
        }
        toBeParsedList.add( newObject );
    }


    /**
     * Will parse every object necessary to load a single page from the pdf document.
     * We try our best to order objects according to offset in file before reading
     * to minimize seek operations.
     * 
     * @param dict the COSObject from the parent pages.
     * @param excludeObjects dictionary object reference entries with these names will not be parsed
     * 
     * @throws IOException
     */
    private void parseDictObjects( COSDictionary dict, COSName... excludeObjects ) throws IOException 
    {
        // ---- create queue for objects waiting for further parsing
        final Queue<COSBase>                toBeParsedList = new LinkedList<COSBase>();
        // offset ordered object map
        final TreeMap<Long,List<COSObject>> objToBeParsed  = new TreeMap<Long, List<COSObject>>();  
        // in case of compressed objects offset points to stmObj
        final Set<Long>                     parsedObjects  = new HashSet<Long>();       
        final Set<Long>                     addedObjects   = new HashSet<Long>();       
        
        // ---- add objects not to be parsed to list of already parsed objects
        if ( excludeObjects != null ) 
        {
            for ( COSName objName : excludeObjects ) 
            {
                COSBase baseObj = dict.getItem( objName );
                if ( baseObj instanceof COSObject ) 
                {
                    parsedObjects.add( getObjectId( (COSObject) baseObj ) );
                }
            }
        }
            
        addNewToList( toBeParsedList, dict.getValues(), addedObjects );
        
        // ---- go through objects to be parsed
        while( ! ( toBeParsedList.isEmpty() && objToBeParsed.isEmpty() ) )
        {
            // -- first get all COSObject from other kind of objects and
            //    put them in objToBeParsed; afterwards toBeParsedList is empty
            COSBase baseObj;
            while ( ( baseObj = toBeParsedList.poll() ) != null ) 
            {
                if ( baseObj instanceof COSStream )
                {
                    addNewToList( toBeParsedList, ((COSStream) baseObj).getValues(), addedObjects );
                }
                else if ( baseObj instanceof COSDictionary )
                {
                    addNewToList( toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects );
                }
                else if ( baseObj instanceof COSArray )
                {
                    final Iterator<COSBase> arrIter = ( (COSArray) baseObj ).iterator();
                    while ( arrIter.hasNext() )
                    {
                        addNewToList( toBeParsedList, arrIter.next(), addedObjects );
                    }
                }
                else if ( baseObj instanceof COSObject )
                {
                    COSObject    obj    = (COSObject) baseObj;
                    long         objId  = getObjectId( obj );
                    COSObjectKey objKey = new COSObjectKey( obj.getObjectNumber().intValue(),
                            obj.getGenerationNumber().intValue() );
                                    
                    if ( ! ( parsedObjects.contains( objId ) /*|| document.hasObjectInPool( objKey ) */ ) )
                    {
                        Long fileOffset = xrefTrailerResolver.getXrefTable().get( objKey );
                        //  it is allowed that object references point to null, thus we have to test
                        if ( fileOffset != null )  
                        {
                            if ( fileOffset > 0 )
                            {
                                objToBeParsed.put( fileOffset, Collections.singletonList( obj ) );
                            }
                            else 
                            {
                                // negative offset means we have a compressed object within object stream;
                                // get offset of object stream
                                fileOffset = xrefTrailerResolver.getXrefTable().get( new COSObjectKey( -fileOffset, 0 ) );
                                if ( ( fileOffset == null ) || ( fileOffset <= 0 ) )
                                {
                                    throw new IOException( "Invalid object stream xref object reference: " + fileOffset );
                                }
                                
                                List<COSObject> stmObjects = objToBeParsed.get( fileOffset );
                                if ( stmObjects == null )
                                {
                                    objToBeParsed.put( fileOffset, stmObjects = new ArrayList<COSObject>() );
                                }
                                stmObjects.add( obj );
                            }
                        }
                        else
                        {
                            // NULL object
                            COSObject pdfObject = document.getObjectFromPool( objKey );
                            pdfObject.setObject( COSNull.NULL );
                        }
                    }
                }
            }
            
            // ---- read first COSObject with smallest offset;
            //      resulting object will be added to toBeParsedList
            if ( objToBeParsed.isEmpty() )
            {
                break;
            }
        
            for ( COSObject obj : objToBeParsed.remove( objToBeParsed.firstKey() ) )
            {
                COSBase  parsedObj = parseObjectDynamically( obj, false );
                
                obj.setObject( parsedObj );
                addNewToList( toBeParsedList, parsedObj, addedObjects );
                
                parsedObjects.add( getObjectId( obj ) );
            }
        }
    }
    
    /**
     * This will parse the next object from the stream and add it to 
     * the local state. 
     * This is taken from {@link PDFParser} and reduced to parsing
     * an indirect object.
     *
     * @param  obj object to be parsed (we only take object number and generation number for lookup start offset)
     * @param  requireExistingNotCompressedObj  if <code>true</code> object to be parsed must 
     *          not be contained within compressed stream
     * @return  the parsed object (which is also added to document object)
     * 
     * @throws IOException If an IO error occurs.
     */
    private COSBase parseObjectDynamically( COSObject obj, boolean requireExistingNotCompressedObj )
    throws IOException
    {
        return parseObjectDynamically( obj.getObjectNumber().intValue(),
                obj.getGenerationNumber().intValue(),
                requireExistingNotCompressedObj );
    }


    /**
     * Parses the indirect object with the given object and generation number,
     * unless the corresponding pool object already has content, and returns
     * the content of the pool object.
     * <p>
     * The xref table entry of the object decides how it is read:
     * no entry results in {@link COSNull#NULL}; a positive value is used as
     * file offset of an uncompressed indirect object; any other value is
     * negated and used as object number of the object stream which contains
     * the object.
     * </p>
     * This is taken from {@link PDFParser} and reduced to parsing
     * an indirect object.
     *
     * @param  objNr object number of object to be parsed
     * @param  objGenNr object generation number of object to be parsed
     * @param requireExistingNotCompressedObj  if <code>true</code> the object to be parsed must be defined
     *                                          in xref (comment: null objects may be missing from xref) and
     *                                          it must not be a compressed object within object stream
     *                                          (this is used to circumvent being stuck in a loop in a malicious PDF) 
     * 
     * @return  the content of the pool object after parsing; it can still be <code>null</code>
     *          if the object should be read from an object stream but was not registered
     *          by the code below (object stream is no stream or does not provide the object)
     * 
     * @throws IOException If an IO error occurs, if the xref entry points to a different object,
     *                     if the object does not end with 'endobj', if decryption fails or if
     *                     <code>requireExistingNotCompressedObj</code> is violated.
     */
    private COSBase parseObjectDynamically( int objNr, int objGenNr, boolean requireExistingNotCompressedObj )
    throws IOException
    {
        // ---- create object key and get object (container) from pool
        final COSObjectKey objKey    = new COSObjectKey( objNr, objGenNr );
        final COSObject    pdfObject = document.getObjectFromPool( objKey );
            
        if ( pdfObject.getObject() == null )
        {
            // not previously parsed
            // ---- read offset or object stream object number from xref table
            Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get( objKey );
            
            // sanity test to circumvent loops with broken documents
            if ( requireExistingNotCompressedObj &&
                    ( ( offsetOrObjstmObNr == null ) || ( offsetOrObjstmObNr <= 0 ) ) )
            {   
                throw new IOException( "Object must be defined and must not be compressed object: " + 
                        objKey.getNumber() + ":" + objKey.getGeneration() );
            }
        
            if ( offsetOrObjstmObNr == null )
            {
                // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
                pdfObject.setObject( COSNull.NULL );
            }
            else if ( offsetOrObjstmObNr > 0 )
            {
                // offset of indirect object in file
                // ---- go to object start
                setPdfSource( offsetOrObjstmObNr );
                
                // ---- we must have an indirect object
                final int readObjNr  = readInt();
                final int readObjGen = readInt();
                readPattern( OBJ_MARKER );
                
                // ---- consistency check
                if ( ( readObjNr != objKey.getNumber() ) ||
                        ( readObjGen != objKey.getGeneration() ) ) 
                {
                    throw new IOException( "XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() +
                            " points to wrong object: " + readObjNr + ":" + readObjGen );
                }
                
                skipSpaces();
                // read the object value and the keyword following it
                // (expected to be either 'stream' or something starting with 'endobj')
                COSBase pb           = parseDirObject();
                String  endObjectKey = readString();
                
                if ( endObjectKey.equals( "stream" ) ) 
                {
                    // push the keyword back (with a leading space) since
                    // parseCOSStream() reads the 'stream' keyword itself
                    pdfSource.unread( endObjectKey.getBytes("ISO-8859-1") );
                    pdfSource.unread( ' ' );
                    if( pb instanceof COSDictionary )
                    {
                        COSStream stream = parseCOSStream( (COSDictionary)pb,
                                getDocument().getScratchFile() );
                                     
                        if ( securityHandler != null )
                        {
                            try 
                            {
                                securityHandler.decryptStream(stream, objNr, objGenNr );
                            } 
                            catch ( CryptographyException ce ) 
                            {
                                // only the message of the cause is kept (see TODO below)
                                throw new IOException( "Error decrypting stream object " + objNr + ": " + ce.getMessage()
                                /*, ce // TODO: remove remark with Java 1.6 */ );
                            }
                        }
                        // the stream replaces the plain dictionary as object content
                        pb = stream;
                    }
                    else
                    {
                        // this is not legal
                        // the combination of a dict and the stream/endstream forms a complete stream object
                        throw new IOException( "Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ")." );
                    }
                    skipSpaces();
                    endObjectKey = readLine();
                    
                    // we have case with a second 'endstream' before endobj
                    if ( ! endObjectKey.startsWith( "endobj" ) )
                    {
                        if ( endObjectKey.startsWith( "endstream" ) ) 
                        {
                            // drop the 9 characters of 'endstream' and keep the rest of the line
                            endObjectKey = endObjectKey.substring( 9 ).trim();
                            if ( endObjectKey.length() == 0 )
                            {
                                // no other characters in extra endstream line
                                endObjectKey = readLine();    // read next line 
                            }
                        }
                    }
                }
                else if ( securityHandler != null )
                {
                    // decrypt
                    // (only a string object itself or strings directly contained in a
                    //  dictionary or array are handled here; nested containers are not visited)
                    if ( pb instanceof COSString )
                    {
                        decrypt( (COSString) pb, objNr, objGenNr );
                    }
                    else if ( pb instanceof COSDictionary )
                    {
                        for( Entry<COSName,COSBase> entry : ((COSDictionary) pb).entrySet() )
                        {
                            // TODO: specially handle 'Contents' entry of signature dictionary like in SecurityHandler#decryptDictionary
                            if ( entry.getValue() instanceof COSString )
                            {
                                decrypt( (COSString) entry.getValue(), objNr, objGenNr );
                            }
                        }
                    }
                    else if ( pb instanceof COSArray )
                    {
                        final COSArray array = (COSArray) pb;
                        for( int aIdx = 0, len = array.size(); aIdx < len; aIdx++ )
                        {
                            if ( array.get( aIdx ) instanceof COSString )
                            {
                                decrypt( (COSString) array.get( aIdx ), objNr, objGenNr );
                            }
                        }
                    }
                }
                      
                // note: the content is stored before the 'endobj' test below,
                // thus it stays set even if that test throws
                pdfObject.setObject( pb );
                
                if ( ! endObjectKey.startsWith( "endobj" ) )
                {
                    throw new IOException( "Object (" + readObjNr + ":" + readObjGen +
                            ") at offset " + offsetOrObjstmObNr + " does not end with 'endobj'." );
                }
                
                releasePdfSourceInputStream();
                        
            }
            else
            {
                // xref value is object nr of object stream containing object to be parsed;
                // since our object was not found it means object stream was not parsed so far
                final int     objstmObjNr   = (int) ( - offsetOrObjstmObNr );
                // the object stream itself is requested with generation 0 and must be an
                // existing, uncompressed object (third argument) to prevent endless recursion
                final COSBase objstmBaseObj = parseObjectDynamically( objstmObjNr, 0, true );
                if ( objstmBaseObj instanceof COSStream )
                {
                    // parse object stream
                    PDFObjectStreamParser parser =
                        new PDFObjectStreamParser( (COSStream) objstmBaseObj, document, forceParsing );
                    parser.parse();
                    
                    // get set of object numbers referenced for this object stream
                    final Set<Long> refObjNrs = xrefTrailerResolver.getContainedObjectNumbers( objstmObjNr );
                    
                    // register all objects which are referenced to be contained in object stream
                    for( COSObject next : parser.getObjects() )
                    {
                        COSObjectKey stmObjKey = new COSObjectKey( next );
                        if ( refObjNrs.contains( stmObjKey.getNumber() ) )
                        {
                            COSObject stmObj = document.getObjectFromPool( stmObjKey );
                            stmObj.setObject( next.getObject() );
                        }
                    }
                }
                // NOTE(review): if the object stream is no COSStream nothing is registered
                // and the requested object keeps having no content
            }
        }   
        return pdfObject.getObject();
    }
    
    // ------------------------------------------------------------------------
    /** Decrypts given COSString. */
    private final void decrypt( COSString str, long objNr, long objGenNr )
    throws IOException
    {
        try 
        {
            securityHandler.decryptString( str, objNr, objGenNr );
        }
        catch ( CryptographyException ce )
        {
            throw new IOException( "Error decrypting string: " + ce.getMessage()
            /*, ce // TODO: remove remark with Java 1.6 */ );
        }   
    }
  
    // ------------------------------------------------------------------------
    private boolean inGetLength = false;
    
    /** Returns length value referred to or defined in given object. */
    private COSNumber getLength( final COSBase lengthBaseObj ) throws IOException
    {
        if ( lengthBaseObj == null )
        {
            return null;
        }
        
        if ( inGetLength )
        {
            throw new IOException( "Loop while reading length from " + lengthBaseObj );
        }
        
        COSNumber retVal = null;
        
        try
        {
            inGetLength = true;
            
            // ---- maybe length was given directly
            if ( lengthBaseObj instanceof COSNumber )
            {
                retVal = (COSNumber) lengthBaseObj;
            }
            // ---- length in referenced object
            else if ( lengthBaseObj instanceof COSObject )
            {
                COSObject lengthObj = (COSObject) lengthBaseObj;
                
                if ( lengthObj.getObject() == null ) 
                {
                    // not read so far
                    
                    // keep current stream position
                    final long curFileOffset = getPdfSourceOffset();
                    releasePdfSourceInputStream();
                    
                    parseObjectDynamically( lengthObj, true );
                    
                    // reset current stream position
                    setPdfSource( curFileOffset );
                    
                    if ( lengthObj.getObject() == null )
                    {
                        throw new IOException( "Length object content was not read." );
                    }
                }
                
                if ( ! ( lengthObj.getObject() instanceof COSNumber ) )
                {
                    throw new IOException( "Wrong type of referenced length object " + lengthObj + ": " + 
                            lengthObj.getObject().getClass().getSimpleName() );
                }
                            
                retVal = (COSNumber) lengthObj.getObject();
                
            }
            else
            {
                throw new IOException( "Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName() );
            }
        }
        finally
        {
            inGetLength = false;
        }
        return retVal;
    }
  
    // ------------------------------------------------------------------------
    private final int    streamCopyBufLen = 8192;
    private final byte[] streamCopyBuf    = new byte[ streamCopyBufLen ];
    
    /**
     * This will read a COSStream from the input stream using length attribute
     * within dictionary.
     * If length attribute is a indirect reference it is first resolved to get
     * the stream length. This means we copy stream data without testing for
     * 'endstream' or 'endobj' and thus it is no problem if these keywords
     * occur within stream.
     * We require 'endstream' to be found after stream data is read. 
     *
     * @param dic  dictionary that goes with this stream.
     * @param file  file to write the stream to when reading.
     *
     * @return parsed pdf stream.
     *
     * @throws IOException if an error occurred reading the stream, like problems
     *         with reading length attribute, stream does not end with 'endstream'
     *         after data read, stream too short etc.
     */
    @Override
    protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException
    {
        final COSStream stream = new COSStream( dic, file );
        OutputStream out = null;
        try
        {
            readString();    // read 'stream'; this was already tested in parseObjectsDynamically()
            
            // ---- skip whitespaces before start of data
            //      PDF Ref 1.7, chap. 3.2.7:
            //      'stream' should be followed by either a CRLF (0x0d 0x0a) or LF but nothing else.
            {
                int whitespace = pdfSource.read();
                
                //see brother_scan_cover.pdf, it adds whitespaces
                //after the stream but before the start of the
                //data, so just read those first
                while (whitespace == 0x20)
                {
                    whitespace = pdfSource.read();
                }
                
                if( whitespace == 0x0D )
                {
                    whitespace = pdfSource.read();
                    if( whitespace != 0x0A )
                    {
                        // the spec says this is invalid but it happens in the real
                        // world so we must support it
                        pdfSource.unread( whitespace );
                    }
                }
                else if (whitespace != 0x0A)
                {
                    // no whitespace after 'stream'; PDF ref. says 'should' so that is ok
                    pdfSource.unread( whitespace );
                }
            } 
            
            /*This needs to be dic.getItem because when we are parsing, the underlying object
             * might still be null.
             */
            COSNumber streamLengthObj = getLength( dic.getItem( COSName.LENGTH ) );
            if ( streamLengthObj == null )
            {
                      throw new IOException( "Missing length for stream." );
            }
              
            // ---- get output stream to copy data to
            out = stream.createFilteredStream( streamLengthObj );
            
            long remainBytes = streamLengthObj.longValue();
            
            while ( remainBytes > 0 )
            {
                final int readBytes = pdfSource.read( streamCopyBuf, 0, 
                        ( remainBytes > streamCopyBufLen ) ? streamCopyBufLen : (int) remainBytes );
                if ( readBytes <= 0 )
                {
                    throw new IOException( "No more bytes from stream but expected: " + remainBytes );
                }     
                out.write( streamCopyBuf, 0, readBytes );
                
                remainBytes -= readBytes;
            }
            
            String endStream = readString();
            
            if ( ! endStream.equals( "endstream" ) )
            {
                throw new IOException( "Error reading stream using length value. Expected='endstream' actual='" 
                        + endStream + "' " );
            } 
            
        } 
        finally 
        {
            if ( out != null ) 
            {
                out.close();
            }
        }
        return stream;
    }
}
Source Code of org.apache.pdfbox.pdfparser.NonSequentialPDFParser

Related Classes of org.apache.pdfbox.pdfparser.NonSequentialPDFParser