Package org.apache.pdfbox.pdfparser

Source Code of org.apache.pdfbox.pdfparser.NonSequentialPDFParser

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;


import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.KeyStore;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.PushBackInputStream;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandlersManager;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.persistence.util.COSObjectKey;

/**
* PDFParser which first reads startxref and xref tables in order to know valid
* objects and parse only these objects. Thus it is closer to a conforming parser
* than the sequential reading of {@link PDFParser}.
*
* This class can be used as a {@link PDFParser} replacement. First {@link #parse()}
* must be called before page objects can be retrieved, e.g. {@link #getPDDocument()}.
*
* This class is a much enhanced version of <code>QuickParser</code> presented in
* <a href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a>
* by Jeremy Villalobos.
*/
public class NonSequentialPDFParser extends PDFParser
{
 
    /**
     * Name of the system property which, when set to <code>"true"</code>,
     * switches on minimal catalog parsing (see {@link #parseMinimalCatalog}).
     */
    public static final String SYSPROP_PARSEMINIMAL =
        "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal";
    /**
     * Name of the system property holding an integer that is handed to
     * {@link #setEOFLookupRange(int)} when the parser is initialized.
     */
    public static final String SYSPROP_EOFLOOKUPRANGE =
        "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";
       
    /** Empty stream handed to the super constructor; the real input is set up afterwards. */
    private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream( new byte[0] );
   
    /** Default number of trailing file bytes searched for the EOF and startxref markers. */
    protected static final int    DEFAULT_TRAIL_BYTECOUNT = 2048;
    /** The end of file marker '%%EOF'. */
    protected static final char[] EOF_MARKER              = new char[] { '%','%','E','O','F' };
    /** The 'startxref' keyword. */
    protected static final char[] STARTXREF_MARKER        = new char[] { 's','t','a','r','t','x','r','e','f' };
    /** The 'obj' keyword which ends the head of an indirect object. */
    protected static final char[] OBJ_MARKER              = new char[] { 'o','b','j' };

    /** The PDF file being parsed (a temporary copy if the parser was created from a stream). */
    private final File pdfFile;
    /** Random access stream over {@link #pdfFile}; wrapped by the push back stream used for parsing. */
    private final RandomAccessBufferedFileInputStream raStream;
   
    /** Security handler created during the initial parse if the trailer has an encryption entry. */
    protected SecurityHandler securityHandler = null;
   
    // key store file name and alias are only read within the visible part of this class;
    // NOTE(review): presumably they are assigned further down in the file - confirm
    private String keyStoreFilename = null;
    private String alias            = null;
    // password used for decryption (and for opening the key store if one is configured)
    private String password         = "";
    private int    readTrailBytes   = DEFAULT_TRAIL_BYTECOUNT;    // how many trailing bytes to read for EOF marker
   
    /** If <code>true</code> object references in catalog are not followed;
     *  pro: page objects will be only parsed when needed; cons: some information of catalog
     *  might not be available (e.g. outline).
     *  Catalog parsing without pages is not an option since a number of entries will
     *  also refer to page objects (like OpenAction).
     *  The initial value is taken from system property {@link #SYSPROP_PARSEMINIMAL}.
     */
    private boolean parseMinimalCatalog = "true".equals( System.getProperty( SYSPROP_PARSEMINIMAL ) );
   
    /** Set once {@link #initialParse()} has completed. */
    private boolean initialParseDone = false;
    /** Set once all objects reachable from the catalog or from all pages have been parsed. */
    private boolean allPagesParsed   = false;
        
    private static final Log LOG = LogFactory.getLog( NonSequentialPDFParser.class );

    /**
     * <code>true</code> if the NonSequentialPDFParser is initialized by an InputStream; in this case
     * a temporary file is created. At the end of the {@linkplain #parse()} method the temporary file will
     * be deleted.
     */
    private boolean isTmpPDFFile = false;
   
    /** File name prefix of the temporary file created for stream input. */
    public static final String TMP_FILE_PREFIX = "tmpPDF";
   
    // ------------------------------------------------------------------------
    /**
     * Constructs a parser for the PDF file with the given name.
     * Delegates to {@link #NonSequentialPDFParser(File, RandomAccess)} with a
     * <code>null</code> buffer, in which case a memory buffer is used for
     * temporary storage.
     *
     * @param filename the filename of the pdf to be parsed
     *
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser( String filename ) throws IOException
    {
      this( new File( filename ), null );
    }

    /**
     * Constructs parser for given file using given buffer for temporary storage.
     * An empty string is used as decryption password.
     *
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing; if <code>null</code>
     *              a memory buffer is used
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser( File file, RandomAccess raBuf ) throws IOException
    {
      this(file, raBuf, "");
    }

    /**
     * Constructs parser for given file using given buffer for temporary storage.
     *
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing
     * 
     * @throws IOException If something went wrong.
     */
    /**
     * Constructs parser for given file using given buffer for temporary storage.
     *
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing
     * @param decryptionPassword password to be used for decryption
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser( File file, RandomAccess raBuf, String decryptionPassword ) throws IOException
    {
      super( EMPTY_INPUT_STREAM, null, false );
      pdfFile = file;
      raStream = new RandomAccessBufferedFileInputStream( pdfFile );
      init(file, raBuf, decryptionPassword);
    }

    private void init(File file, RandomAccess raBuf, String decryptionPassword) throws IOException {
      String eofLookupRangeStr = System.getProperty( SYSPROP_EOFLOOKUPRANGE );
      if ( eofLookupRangeStr != null )
      {
        try
        {
          setEOFLookupRange( Integer.parseInt( eofLookupRangeStr ) );
        }
        catch ( NumberFormatException nfe )
        {
          LOG.warn( "System property " + SYSPROP_EOFLOOKUPRANGE +
              " does not contain an integer value, but: '" + eofLookupRangeStr + "'" );
        }
      }

      setDocument( ( raBuf == null ) ? new COSDocument( new RandomAccessBuffer(), false ) new COSDocument( raBuf, false ) );

      pdfSource = new PushBackInputStream( raStream, 4096 );

      password = decryptionPassword;
    }

    public NonSequentialPDFParser(InputStream input) throws IOException
    {
      super( EMPTY_INPUT_STREAM, null, false );
      pdfFile = createTmpFile(input);
      raStream = new RandomAccessBufferedFileInputStream( pdfFile );
      init(pdfFile, null, "");
    }

    /**
     * Create a temporary file with the input stream.
     * If the creation succeed, the {@linkplain #isTmpPDFFile} is set to true.
     * This Temporary file will be deleted at end of the parse method
     * @param input
     * @return
     * @throws IOException
     */
    private File createTmpFile(InputStream input) throws IOException {
      File tmpFile = null;
      FileOutputStream fos = null;
      try {
        tmpFile = File.createTempFile(TMP_FILE_PREFIX, ".pdf");
        fos = new FileOutputStream(tmpFile);
        IOUtils.copy(input, fos);
        isTmpPDFFile = true;
        return tmpFile;
      } finally {
        IOUtils.closeQuietly(input);
        IOUtils.closeQuietly(fos);
      }
    }

    // ------------------------------------------------------------------------
    /**
     *  Sets how many trailing bytes of PDF file are searched for
     *  EOF marker and 'startxref' marker.
     *  If not set we use default value {@link #DEFAULT_TRAIL_BYTECOUNT}.
     * 
     *  <p<We check that new value is at least 16. However for practical use
     *  cases this value should not be lower than 1000; even 2000
     *  was found to not be enough in some cases where some trailing
     *  garbage like HTML snippets followed the EOF marker.</p>
    
     *  <p>In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined
     *  this value will be set on initialization but can be overwritten later.</p>
    
     *  @param byteCount number of trailing bytes
     */
    public void setEOFLookupRange( int byteCount )
    {
            if ( byteCount > 15 )
            {
                readTrailBytes = byteCount;
            }
    }
       
    // ------------------------------------------------------------------------
    /**
     * The initial parse will first parse only the trailer, the xrefstart and
     * all xref tables to have a pointer (offset) to all the pdf's objects.
     * It can handle linearized pdfs, which will have an xref at the
     * end pointing to an xref at the beginning of the file.
     * Last the root object is parsed.
     *
     * @throws IOException
     */
    protected void initialParse() throws IOException
    {
        final long startxrefOff = getStartxrefOffset();
           
        // ---- parse startxref
        setPdfSource( startxrefOff );
        parseStartXref();
       
        final long xrefOffset = document.getStartXref();
        long       prev       = xrefOffset;
           
        // ---- parse whole chain of xref tables/object streams using PREV reference
        while( prev > -1 )
        {
            // seek to xref table
            setPdfSource( prev );
           
            // -- parse xref
            if ( pdfSource.peek() == 'x' )
            {
                // xref table and trailer
                // use existing parser to parse xref table
                parseXrefTable( prev );
               
                // parse the last trailer.
                if ( ! parseTrailer() )
                {
                    throw new IOException( "Expected trailer object at position: " + pdfSource.getOffset() );
                }
                COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
                prev = trailer.getInt( COSName.PREV );
            }
            else
            {
                // xref stream
                prev = parseXrefObjStream( prev );
            }
        }
       
        // ---- build valid xrefs out of the xref chain
        xrefTrailerResolver.setStartxref( xrefOffset );
        document.setTrailer( xrefTrailerResolver.getTrailer() );
           
        // ---- prepare encryption if necessary
        COSBase trailerEncryptItem = document.getTrailer().getItem( COSName.ENCRYPT );
        if ( trailerEncryptItem != null )
        {
          if ( trailerEncryptItem instanceof COSObject )
          {
            COSObject trailerEncryptObj = (COSObject) trailerEncryptItem;
            parseObjectDynamically( trailerEncryptObj, true );
          }
          try
          {
              PDEncryptionDictionary encParameters = new PDEncryptionDictionary( document.getEncryptionDictionary() );
                   
              DecryptionMaterial decryptionMaterial = null;
              if( keyStoreFilename != null )
              {
                  KeyStore ks = KeyStore.getInstance( "PKCS12" );
                  ks.load( new FileInputStream( keyStoreFilename ), password.toCharArray() );
                 
                  decryptionMaterial = new PublicKeyDecryptionMaterial( ks, alias, password );
              }
              else
              {
                  decryptionMaterial = new StandardDecryptionMaterial( password );
              }
                   
              securityHandler = SecurityHandlersManager.getInstance().getSecurityHandler( encParameters.getFilter() );
              securityHandler.prepareForDecryption( encParameters, document.getDocumentID(), decryptionMaterial );
                   
              AccessPermission permission = securityHandler.getCurrentAccessPermission();
              if ( ! permission.canExtractContent() )
              {
                  LOG.warn( "PDF file '" + pdfFile.getPath() + "' does not allow extracting content." );
              }
               
          }
          catch ( Exception e )
          {
              throw new IOException( "Error (" + e.getClass().getSimpleName() +
                      ") while creating security handler for decryption: " +
                                                 e.getMessage() /*, e // TODO: remove remark with Java 1.6 */);
          }
        }
   
        // ---- parse catalog or root object
        COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem( COSName.ROOT );
       
        if ( root == null )
        {
            throw new IOException( "Missing root object specification in trailer." );
        }
           
        parseObjectDynamically( root, false );
       
        // ---- resolve all objects (including pages)
        if ( ! parseMinimalCatalog )
        {
            COSObject catalogObj = document.getCatalog();
            if ( catalogObj != null )
            {
                if ( catalogObj.getObject() instanceof COSDictionary )
                {
                    parseDictObjects( (COSDictionary) catalogObj.getObject(), (COSName[]) null );
                    allPagesParsed = true;
                    document.setDecrypted();
                }
            }
        }
        initialParseDone = true;
       
    }
   
    // ------------------------------------------------------------------------
    /** Parses an xref object stream starting with indirect object id.
     * 
     *  @return value of PREV item in dictionary or <code>-1</code> if no such item exists
     */
    private long parseXrefObjStream( long objByteOffset ) throws IOException
    {
        // ---- parse indirect object head
        readInt();
        readInt();
        readPattern( OBJ_MARKER );
       
        COSDictionary dict       = parseCOSDictionary();
        COSStream     xrefStream = parseCOSStream(dict, getDocument().getScratchFile() );
        parseXrefStream( xrefStream, (int) objByteOffset );
       
        return dict.getLong( COSName.PREV );
    }

    // ------------------------------------------------------------------------
    /** Returns the current offset as reported by {@link #pdfSource}, i.e. the
     *  position in file at which next byte would be read.
     *
     *  @return current offset of the parsing source
     */
    private final long getPdfSourceOffset()
    {
        return pdfSource.getOffset();
    }

    /** Sets {@link #pdfSource} to start next parsing at given file offset.
     *
     *  @param fileOffset offset in file the parsing source should seek to
     *
     *  @throws IOException if seeking fails
     */
    protected final void setPdfSource( long fileOffset ) throws IOException
    {
        
        pdfSource.seek( fileOffset );

        // alternative using 'old fashioned' input stream (kept for reference only;
        // it would re-open the file and skip to the offset instead of seeking)
        //        if ( pdfSource != null )
        //            pdfSource.close();
        //        
        //        pdfSource = new PushBackInputStream(
        //                            new BufferedInputStream(
        //                                new FileInputStream( file ), 16384),  4096);
        //        pdfSource.skip( _fileOffset );
    }

    /** Enable handling of alternative pdfSource implementation.
     *  Currently this method does nothing; the commented out code belongs to the
     *  alternative input stream handling sketched in {@link #setPdfSource(long)}.
     *
     *  @throws IOException declared for the alternative implementation; never thrown by the current one
     */
    protected final void releasePdfSourceInputStream() throws IOException
    {
        //        if ( pdfSource != null )
        //            pdfSource.close();
    }

    private final void closeFileStream() throws IOException
    {
        if ( pdfSource != null )
        {
            pdfSource.close();
        }
    }

    // ------------------------------------------------------------------------
    /** Looks for and parses startxref. We first look for last '%%EOF' marker
     *  (within last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via
     *  {@link #setEOFLookupRange(int)}) and go back to find <code>startxref</code>. */
    protected final long getStartxrefOffset() throws IOException
    {
        byte[] buf;
        long   skipBytes;
       
        // ---- read trailing bytes into buffer
        final long fileLen = pdfFile.length();
       
        FileInputStream fIn = null;
        try
        {
            fIn = new FileInputStream( pdfFile );
           
            final int trailByteCount = ( fileLen < readTrailBytes ) ? (int) fileLen : readTrailBytes;
            buf = new byte[ trailByteCount ];
            fIn.skip( skipBytes = fileLen - trailByteCount );
           
            int off = 0;
            int readBytes;
            while ( off < trailByteCount )
            {
                readBytes = fIn.read( buf, off, trailByteCount - off );
                // in order to not get stuck in a loop we check readBytes (this should never happen)
                if ( readBytes < 1 )
                {
                    throw new IOException( "No more bytes to read for trailing buffer, but expected: " +
                                                               ( trailByteCount - off ) );
                }
                off += readBytes;
            }
        }
        finally
        {
            if ( fIn != null )
            {
                try
                {
                    fIn.close();
                }
                catch ( IOException ioe )
                {}
            }
        }
           
        // ---- find last '%%EOF'
        int bufOff = lastIndexOf( EOF_MARKER, buf, buf.length );
       
        if ( bufOff < 0 )
        {
            throw new IOException( "Missing end of file marker '" + ( new String( EOF_MARKER ) ) + "'" );
        }  
        // ---- find last startxref preceding EOF marker
        bufOff = lastIndexOf( STARTXREF_MARKER, buf, bufOff );
       
        if ( bufOff < 0 )
        {
            throw new IOException( "Missing 'startxref' marker." );
        }
        return skipBytes + bufOff;
    }

    // ------------------------------------------------------------------------
    /** Searches last appearance of pattern within buffer. Lookup before _lastOff
     *  and goes back until 0.
     * 
     *  @param pattern  pattern to search for
     *  @param buf      buffer to search pattern in
     *  @param endOff   offset (exclusive) where lookup starts at
     * 
     *  @return  start offset of pattern within buffer or <code>-1</code> if pattern could not be found
     */
    protected int lastIndexOf( final char[] pattern, final byte[] buf, final int endOff )
    {
        final int lastPatternChOff = pattern.length - 1;
       
        int  bufOff   = endOff;
        int  patOff   = lastPatternChOff;
        char lookupCh = pattern[ patOff ];
       
        while ( --bufOff >= 0 )
        {
            if ( buf[ bufOff ] == lookupCh )
            {
                if ( --patOff < 0 )
                {
                    // whole pattern matched
                    return bufOff;
                }
                // matched current char, advance to preceding one
                lookupCh = pattern[ patOff ];
            }
            else if ( patOff < lastPatternChOff )
            {
                // no char match but already matched some chars; reset
                lookupCh = pattern[ patOff = lastPatternChOff ];
            }
        }
       
        return -1;
    }

    // ------------------------------------------------------------------------
    /** Reads given pattern from {@link #pdfSource}. Skipping whitespace at start and end.
     *
     * @throws IOException if pattern could not be read
     */
    protected final void readPattern( final char[] pattern ) throws IOException
    {
        skipSpaces();
       
        for ( char c : pattern )
        {
            if ( pdfSource.read() != c )
            {
                throw new IOException( "Expected pattern '" + new String( pattern +
                        " but missed at character '" + c + "'" );
            }
        }
       
        skipSpaces();
    }

    // ------------------------------------------------------------------------
    private COSDictionary pagesDictionary = null;
   
    /** Returns PAGES {@link COSDictionary} object or throws {@link IOException}
     *  if PAGES dictionary does not exist. */
    private COSDictionary getPagesObject() throws IOException
    {
        if ( pagesDictionary != null )
        {
            return pagesDictionary;
        }  
        COSObject pages = (COSObject) document.getCatalog().getItem( COSName.PAGES );
           
        if ( pages == null )
        {
            throw new IOException( "Missing PAGES entry in document catalog." );
        }
           
        COSBase object = parseObjectDynamically( pages, false );
           
        if ( ! ( object instanceof COSDictionary ) )
        {
            throw new IOException( "PAGES not a dictionary object, but: " +
                    object.getClass().getSimpleName() );
        }
           
        pagesDictionary = (COSDictionary) object;
       
        return pagesDictionary;
    }

    // ------------------------------------------------------------------------
    /** Parses all objects needed by pages and closes input stream. */
    /**
     * {@inheritDoc}
     */
    @Override
    public void parse() throws IOException
    {
        boolean exceptionOccurred = true;    // set to false if all is processed
       
        try
        {
            if ( ! initialParseDone )
            {
                initialParse();
            }
               
            final int pageCount = getPageNumber();
           
            if ( ! allPagesParsed )
            {
                for ( int pNr = 0; pNr < pageCount; pNr++ )
                {
                    getPage( pNr );
                }
                allPagesParsed = true;
                document.setDecrypted();
            }
           
            exceptionOccurred = false;
        }
        finally
        {
            try
            {
                closeFileStream();
            }
            catch ( IOException ioe )
            {}

            deleteTempFile();

            if ( exceptionOccurred && ( document != null ) )
            {
                try
                {
                    document.close();
                }
                catch ( IOException ioe )
                {}
            }
        }
    }  

    /**
     * Returns the file this parser reads from. If the parser was created from
     * an input stream this is the temporary file holding the stream content.
     *
     * @return the pdf file
     */
    protected File getPdfFile() {
      return this.pdfFile;
    }

    /**
     * Remove the temporary file.
     * A temporary file is created if this class is instantiated with an InputStream
     */
    protected void deleteTempFile() {
      if (isTmpPDFFile) {
        try {
          if (!pdfFile.delete()) LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted");
        } catch (SecurityException e) {
          LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted", e);
        }
      }
    }
    // ------------------------------------------------------------------------
    /**
     * Returns security handler of the document or <code>null</code> if document
     * is not encrypted or {@link #parse()} wasn't called before.
     *
     * @return the security handler or <code>null</code> if none was set up.
     */
    public SecurityHandler getSecurityHandler()
    {
        return securityHandler;
    }

    // ------------------------------------------------------------------------
    /**
     * This will get the PD document that was parsed.  When you are done with
     * this document you must call close() on it to release resources.
     *
     * Overwriting super method was necessary in order to set security handler.
     *
     * @return The document at the PD layer.
     *
     * @throws IOException If there is an error getting the document.
     */
    @Override
    public PDDocument getPDDocument() throws IOException
    {
        PDDocument pdDocument = super.getPDDocument();
        if ( securityHandler != null )
            pdDocument.setSecurityHandler( securityHandler );
        return pdDocument;
    }

    // ------------------------------------------------------------------------
    /**
     * Returns the number of pages in a document.
     *
     * @return the number of pages.
     *
     * @throws IOException  if PAGES or other needed object is missing
     */
    public int getPageNumber() throws IOException
    {
        int pageCount = getPagesObject().getInt( COSName.COUNT );
       
        if ( pageCount < 0 )
        {
            throw new IOException( "No page number specified." );
        }  
        return pageCount;
    }

    // ------------------------------------------------------------------------
    /**
     * Returns the page requested with all the objects loaded into it.
     *
     * @param pageNr starts from 0 to the number of pages.
     * @return the page with the given pagenumber.
     * @throws IOException If something went wrong.
     */
    public PDPage getPage( int pageNr ) throws IOException
    {
        getPagesObject();
       
        // ---- get list of top level pages
        COSArray kids = (COSArray) pagesDictionary.getDictionaryObject( COSName.KIDS );
       
        if ( kids == null )
        {
            throw new IOException( "Missing 'Kids' entry in pages dictionary." );
        }
           
        // ---- get page we are looking for (possibly going recursively into subpages)
        COSObject pageObj = getPageObject( pageNr, kids, 0 );
       
        if ( pageObj == null )
        {
            throw new IOException( "Page " + pageNr + " not found." );
        }
           
        // ---- parse all objects necessary to load page.
        COSDictionary pageDict = (COSDictionary) pageObj.getObject();
       
        if ( parseMinimalCatalog && ( ! allPagesParsed ) )
        {
            // parse page resources since we did not do this on start
            COSDictionary resDict = (COSDictionary) pageDict.getDictionaryObject( COSName.RESOURCES );
            parseDictObjects( resDict );
        }
       
        return new PDPage( pageDict );
    }

    /**
     * Returns the object for a specific page.
     * The page tree is made up of kids.  The kids have COSArray with COSObjects
     * inside of them. The COSObject can be parsed using the dynamic parsing method
     * We want to only parse the minimum COSObjects and still return a complete page.
     * ready to be used.
     *
     * @param num  the requested page number; numbering starts with 0
     * @param startKids Kids array to start with looking up page number
     * @param startPageCount
     *
     * @return  page object or <code>null</code> if no such page exists
     *
     * @throws IOException
     */
    private COSObject getPageObject( int num, COSArray startKids, int startPageCount ) throws IOException
    {
        int               curPageCount = startPageCount;
        Iterator<COSBase> kidsIter     = startKids.iterator();
       
        while( kidsIter.hasNext() )
        {
            COSObject obj  = (COSObject) kidsIter.next();
            COSBase   base = obj.getObject();
            if( base == null )
            {
                base = parseObjectDynamically( obj, false );
                obj.setObject( base );
            }
           
            COSDictionary dic   = (COSDictionary) base;
            int           count = dic.getInt( COSName.COUNT );
            if ( count >= 0 )
            {
                // skip this branch if requested page comes later
                if( ( curPageCount + count ) <= num )
                {
                    curPageCount += count;
                    continue;
                }
            }
                   
            COSArray kids = (COSArray) dic.getDictionaryObject( COSName.KIDS );
            if( kids != null)
            {
                // recursively scan subpages
                COSObject ans = getPageObject( num, kids, curPageCount );
                // if ans is not null, we got what we were looking for
                if( ans != null )
                {
                    return ans;
                }
            }
            else
            {
                // found page?
                if( curPageCount == num )
                {
                    return obj;
                }
                // page has no kids and it is not the page we are looking for
                curPageCount++;
            }
        }
        return null;
    }

    /** Creates a unique object id using object number and object generation number.
     *  (requires object number < 2^31)) */
    private final long getObjectId( final COSObject obj )
    {
        return ( obj.getObjectNumber().longValue() << 32 ) | obj.getGenerationNumber().longValue();
    }
   
    /** Adds all from newObjects to toBeParsedList if it is not an COSObject
     *  or we didn't add this COSObject already (checked via addedObjects). */
    private final void addNewToList( final Queue<COSBase> toBeParsedList,
                                         final Collection<COSBase> newObjects,
                                         final Set<Long> addedObjects )
    {
        for ( COSBase newObject : newObjects )
        {
            if ( newObject instanceof COSObject )
            {
                final long objId  = getObjectId( (COSObject) newObject );
                if ( ! addedObjects.add( objId ) )
                {
                    continue;
                }
            }
            toBeParsedList.add( newObject );
        }
    }

    /** Adds newObject to toBeParsedList if it is not an COSObject
     *  or we didn't add this COSObject already (checked via addedObjects). */
    private final void addNewToList( final Queue<COSBase> toBeParsedList,
                                         final COSBase newObject,
                                         final Set<Long> addedObjects )
    {
        if ( newObject instanceof COSObject )
        {
            final long objId  = getObjectId( (COSObject) newObject );
            if ( ! addedObjects.add( objId ) )
            {
                return;
            }
        }
        toBeParsedList.add( newObject );
    }

    /**
     * Will parse every object necessary to load a single page from the pdf document.
     * We try our best to order objects according to offset in file before reading
     * to minimize seek operations.
     *
     * @param dict the COSObject from the parent pages.
     * @param excludeObjects dictionary object reference entries with these names will not be parsed
     *
     * @throws IOException
     */
    private void parseDictObjects( COSDictionary dict, COSName... excludeObjects ) throws IOException
    {
        // ---- create queue for objects waiting for further parsing
        final Queue<COSBase>                toBeParsedList = new LinkedList<COSBase>();
        // offset ordered object map
        final TreeMap<Long,List<COSObject>> objToBeParsed  = new TreeMap<Long, List<COSObject>>()
        // in case of compressed objects offset points to stmObj
        final Set<Long>                     parsedObjects  = new HashSet<Long>();      
        final Set<Long>                     addedObjects   = new HashSet<Long>();      
       
        // ---- add objects not to be parsed to list of already parsed objects
        if ( excludeObjects != null )
        {
            for ( COSName objName : excludeObjects )
            {
                COSBase baseObj = dict.getItem( objName );
                if ( baseObj instanceof COSObject )
                {
                    parsedObjects.add( getObjectId( (COSObject) baseObj ) );
                }
            }
        }
           
        addNewToList( toBeParsedList, dict.getValues(), addedObjects );
       
        // ---- go through objects to be parsed
        while( ! ( toBeParsedList.isEmpty() && objToBeParsed.isEmpty() ) )
        {
            // -- first get all COSObject from other kind of objects and
            //    put them in objToBeParsed; afterwards toBeParsedList is empty
            COSBase baseObj;
            while ( ( baseObj = toBeParsedList.poll() ) != null )
            {
                if ( baseObj instanceof COSStream )
                {
                    addNewToList( toBeParsedList, ((COSStream) baseObj).getValues(), addedObjects );
                }
                else if ( baseObj instanceof COSDictionary )
                {
                    addNewToList( toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects );
                }
                else if ( baseObj instanceof COSArray )
                {
                    final Iterator<COSBase> arrIter = ( (COSArray) baseObj ).iterator();
                    while ( arrIter.hasNext() )
                    {
                        addNewToList( toBeParsedList, arrIter.next(), addedObjects );
                    }
                }
                else if ( baseObj instanceof COSObject )
                {
                    COSObject    obj    = (COSObject) baseObj;
                    long         objId  = getObjectId( obj );
                    COSObjectKey objKey = new COSObjectKey( obj.getObjectNumber().intValue(),
                            obj.getGenerationNumber().intValue() );
                                   
                    if ( ! ( parsedObjects.contains( objId ) /*|| document.hasObjectInPool( objKey ) */ ) )
                    {
                        Long fileOffset = xrefTrailerResolver.getXrefTable().get( objKey );
                        //  it is allowed that object references point to null, thus we have to test
                        if ( fileOffset != null
                        {
                            if ( fileOffset > 0 )
                            {
                                objToBeParsed.put( fileOffset, Collections.singletonList( obj ) );
                            }
                            else
                            {
                                // negative offset means we have a compressed object within object stream;
                                // get offset of object stream
                                fileOffset = xrefTrailerResolver.getXrefTable().get( new COSObjectKey( -fileOffset, 0 ) );
                                if ( ( fileOffset == null ) || ( fileOffset <= 0 ) )
                                {
                                    throw new IOException( "Invalid object stream xref object reference: " + fileOffset );
                                }
                               
                                List<COSObject> stmObjects = objToBeParsed.get( fileOffset );
                                if ( stmObjects == null )
                                {
                                    objToBeParsed.put( fileOffset, stmObjects = new ArrayList<COSObject>() );
                                }
                                stmObjects.add( obj );
                            }
                        }
                        else
                        {
                            // NULL object
                            COSObject pdfObject = document.getObjectFromPool( objKey );
                            pdfObject.setObject( COSNull.NULL );
                        }
                    }
                }
            }
           
            // ---- read first COSObject with smallest offset;
            //      resulting object will be added to toBeParsedList
            if ( objToBeParsed.isEmpty() )
            {
                break;
            }
       
            for ( COSObject obj : objToBeParsed.remove( objToBeParsed.firstKey() ) )
            {
                COSBase  parsedObj = parseObjectDynamically( obj, false );
               
                obj.setObject( parsedObj );
                addNewToList( toBeParsedList, parsedObj, addedObjects );
               
                parsedObjects.add( getObjectId( obj ) );
            }
        }
    }
   
    /**
     * This will parse the next object from the stream and add it to
     * the local state.
     * This is taken from {@link PDFParser} and reduced to parsing
     * an indirect object.
     *
     * @param  obj object to be parsed (we only take object number and generation number for lookup start offset)
     * @param  requireExistingNotCompressedObj  if <code>true</code> object to be parsed must
     *          not be contained within compressed stream
     * @return  the parsed object (which is also added to document object)
     *
     * @throws IOException If an IO error occurs.
     */
    protected final COSBase parseObjectDynamically( COSObject obj, boolean requireExistingNotCompressedObj )
    throws IOException
    {
        return parseObjectDynamically( obj.getObjectNumber().intValue(),
                obj.getGenerationNumber().intValue(),
                requireExistingNotCompressedObj );
    }

    /**
     * Parses the indirect object with the given number and generation (unless the
     * pooled object already has content) and stores the result in the document's
     * object pool.
     * This is taken from {@link PDFParser} and reduced to parsing
     * an indirect object.
     *
     * <p>The xref table entry of the object decides what happens:</p>
     * <ul>
     *   <li>no entry: the object is set to {@link COSNull#NULL};</li>
     *   <li>positive value: it is used as file offset; the object is read from there,
     *       including stream data and decryption if a security handler is set;</li>
     *   <li>any other value: its negation is used as object number of the object stream
     *       containing the object; that stream is parsed and all objects referenced
     *       to be contained in it are registered in the pool.</li>
     * </ul>
     *
     * @param  objNr object number of object to be parsed
     * @param  objGenNr object generation number of object to be parsed
     * @param requireExistingNotCompressedObj  if <code>true</code> the object to be parsed must be defined
     *                                          in xref (comment: null objects may be missing from xref) and
     *                                          it must not be a compressed object within object stream
     *                                          (this is used to circumvent being stuck in a loop in a malicious PDF)
     *
     * @return  the content of the pooled object after parsing (which is also added to document
     *          object); may be <code>null</code> if the object should be part of an object stream
     *          but was not registered while parsing that stream
     *
     * @throws IOException If an IO error occurs, the xref entry points to a different object,
     *                     the object is malformed or decryption fails.
     */
    protected COSBase parseObjectDynamically( int objNr, int objGenNr, boolean requireExistingNotCompressedObj )
    throws IOException
    {
        // ---- create object key and get object (container) from pool
        final COSObjectKey objKey    = new COSObjectKey( objNr, objGenNr );
        final COSObject    pdfObject = document.getObjectFromPool( objKey );

        if ( pdfObject.getObject() == null )
        {
            // not previously parsed
            // ---- read offset or object stream object number from xref table
            Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get( objKey );

            // sanity test to circumvent loops with broken documents
            if ( requireExistingNotCompressedObj &&
                    ( ( offsetOrObjstmObNr == null ) || ( offsetOrObjstmObNr <= 0 ) ) )
            {
                throw new IOException( "Object must be defined and must not be compressed object: " +
                        objKey.getNumber() + ":" + objKey.getGeneration() );
            }

            if ( offsetOrObjstmObNr == null )
            {
                // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
                pdfObject.setObject( COSNull.NULL );
            }
            else if ( offsetOrObjstmObNr > 0 )
            {
                // offset of indirect object in file
                // ---- go to object start
                setPdfSource( offsetOrObjstmObNr );

                // ---- we must have an indirect object
                //      (object number, generation number and the object marker)
                final int readObjNr  = readInt();
                final int readObjGen = readInt();
                readPattern( OBJ_MARKER );

                // ---- consistency check: header found at offset must match the requested key
                if ( ( readObjNr != objKey.getNumber() ) ||
                        ( readObjGen != objKey.getGeneration() ) )
                {
                    throw new IOException( "XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() +
                            " points to wrong object: " + readObjNr + ":" + readObjGen );
                }

                skipSpaces();
                COSBase pb           = parseDirObject();
                // token following the object: 'stream' for stream objects, else expected to be 'endobj'
                String  endObjectKey = readString();

                if ( endObjectKey.equals( "stream" ) )
                {
                    // push the keyword back (preceded by a space) since parseCOSStream reads it itself
                    pdfSource.unread( endObjectKey.getBytes("ISO-8859-1") );
                    pdfSource.unread( ' ' );
                    if( pb instanceof COSDictionary )
                    {
                        COSStream stream = parseCOSStream( (COSDictionary)pb,
                                getDocument().getScratchFile() );

                        if ( securityHandler != null )
                        {
                            try
                            {
                                securityHandler.decryptStream(stream, objNr, objGenNr );
                            }
                            catch ( CryptographyException ce )
                            {
                                // only the message is passed on, the cause itself is not attached (see TODO)
                                throw new IOException( "Error decrypting stream object " + objNr + ": " + ce.getMessage()
                                /*, ce // TODO: remove remark with Java 1.6 */ );
                            }
                        }
                        pb = stream;
                    }
                    else
                    {
                        // this is not legal
                        // the combination of a dict and the stream/endstream forms a complete stream object
                        throw new IOException( "Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ")." );
                    }
                    skipSpaces();
                    endObjectKey = readLine();

                    // we have case with a second 'endstream' before endobj
                    if ( ! endObjectKey.startsWith( "endobj" ) )
                    {
                        if ( endObjectKey.startsWith( "endstream" ) )
                        {
                            // drop the keyword (9 characters) and look at the remainder of the line
                            endObjectKey = endObjectKey.substring( 9 ).trim();
                            if ( endObjectKey.length() == 0 )
                            {
                                // no other characters in extra endstream line
                                endObjectKey = readLine();    // read next line
                            }
                        }
                    }
                }
                else if ( securityHandler != null )
                {
                    // decrypt; only the string itself or strings being direct values of the
                    // dictionary or direct elements of the array are handled here
                    if ( pb instanceof COSString )
                    {
                        decrypt( (COSString) pb, objNr, objGenNr );
                    }
                    else if ( pb instanceof COSDictionary )
                    {
                        for( Entry<COSName,COSBase> entry : ((COSDictionary) pb).entrySet() )
                        {
                            // TODO: specially handle 'Contents' entry of signature dictionary like in SecurityHandler#decryptDictionary
                            if ( entry.getValue() instanceof COSString )
                            {
                                decrypt( (COSString) entry.getValue(), objNr, objGenNr );
                            }
                        }
                    }
                    else if ( pb instanceof COSArray )
                    {
                        final COSArray array = (COSArray) pb;
                        for( int aIdx = 0, len = array.size(); aIdx < len; aIdx++ )
                        {
                            if ( array.get( aIdx ) instanceof COSString )
                            {
                                decrypt( (COSString) array.get( aIdx ), objNr, objGenNr );
                            }
                        }
                    }
                }

                // the parsed content is stored before the 'endobj' test below,
                // thus it stays in the pooled object even if that test fails
                pdfObject.setObject( pb );

                if ( ! endObjectKey.startsWith( "endobj" ) )
                {
                    throw new IOException( "Object (" + readObjNr + ":" + readObjGen +
                            ") at offset " + offsetOrObjstmObNr + " does not end with 'endobj'." );
                }

                // not reached if one of the exceptions above was thrown
                releasePdfSourceInputStream();

            }
            else
            {
                // xref value is object nr of object stream containing object to be parsed;
                // since our object was not found it means object stream was not parsed so far
                // (note: an xref value of 0 also ends up in this branch)
                final int     objstmObjNr   = (int) ( - offsetOrObjstmObNr );
                // the object stream itself must be a defined, not compressed object (last parameter)
                final COSBase objstmBaseObj = parseObjectDynamically( objstmObjNr, 0, true );
                if ( objstmBaseObj instanceof COSStream )
                {
                    // parse object stream
                    PDFObjectStreamParser parser =
                        new PDFObjectStreamParser( (COSStream) objstmBaseObj, document, forceParsing );
                    parser.parse();

                    // get set of object numbers referenced for this object stream
                    final Set<Long> refObjNrs = xrefTrailerResolver.getContainedObjectNumbers( objstmObjNr );

                    // register all objects which are referenced to be contained in object stream;
                    // objects found in the stream but not referenced for it are ignored
                    for( COSObject next : parser.getObjects() )
                    {
                        COSObjectKey stmObjKey = new COSObjectKey( next );
                        if ( refObjNrs.contains( stmObjKey.getNumber() ) )
                        {
                            COSObject stmObj = document.getObjectFromPool( stmObjKey );
                            stmObj.setObject( next.getObject() );
                        }
                    }
                }
            }
        }
        return pdfObject.getObject();
    }
   
    // ------------------------------------------------------------------------
    /** Decrypts given COSString. */
    protected final void decrypt( COSString str, long objNr, long objGenNr )
    throws IOException
    {
        try
        {
            securityHandler.decryptString( str, objNr, objGenNr );
        }
        catch ( CryptographyException ce )
        {
            throw new IOException( "Error decrypting string: " + ce.getMessage()
            /*, ce // TODO: remove remark with Java 1.6 */ );
        }  
    }
 
    // ------------------------------------------------------------------------
    private boolean inGetLength = false;
   
    /** Returns length value referred to or defined in given object. */
    private COSNumber getLength( final COSBase lengthBaseObj ) throws IOException
    {
        if ( lengthBaseObj == null )
        {
            return null;
        }
       
        if ( inGetLength )
        {
            throw new IOException( "Loop while reading length from " + lengthBaseObj );
        }
       
        COSNumber retVal = null;
       
        try
        {
            inGetLength = true;
           
            // ---- maybe length was given directly
            if ( lengthBaseObj instanceof COSNumber )
            {
                retVal = (COSNumber) lengthBaseObj;
            }
            // ---- length in referenced object
            else if ( lengthBaseObj instanceof COSObject )
            {
                COSObject lengthObj = (COSObject) lengthBaseObj;
               
                if ( lengthObj.getObject() == null )
                {
                    // not read so far
                   
                    // keep current stream position
                    final long curFileOffset = getPdfSourceOffset();
                    releasePdfSourceInputStream();
                   
                    parseObjectDynamically( lengthObj, true );
                   
                    // reset current stream position
                    setPdfSource( curFileOffset );
                   
                    if ( lengthObj.getObject() == null )
                    {
                        throw new IOException( "Length object content was not read." );
                    }
                }
               
                if ( ! ( lengthObj.getObject() instanceof COSNumber ) )
                {
                    throw new IOException( "Wrong type of referenced length object " + lengthObj + ": " +
                            lengthObj.getObject().getClass().getSimpleName() );
                }
                           
                retVal = (COSNumber) lengthObj.getObject();
               
            }
            else
            {
                throw new IOException( "Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName() );
            }
        }
        finally
        {
            inGetLength = false;
        }
        return retVal;
    }
 
    // ------------------------------------------------------------------------
    private final int    streamCopyBufLen = 8192;
    private final byte[] streamCopyBuf    = new byte[ streamCopyBufLen ];
   
    /**
     * This will read a COSStream from the input stream using length attribute
     * within dictionary.
     * If length attribute is a indirect reference it is first resolved to get
     * the stream length. This means we copy stream data without testing for
     * 'endstream' or 'endobj' and thus it is no problem if these keywords
     * occur within stream.
     * We require 'endstream' to be found after stream data is read.
     *
     * @param dic  dictionary that goes with this stream.
     * @param file  file to write the stream to when reading.
     *
     * @return parsed pdf stream.
     *
     * @throws IOException if an error occurred reading the stream, like problems
     *         with reading length attribute, stream does not end with 'endstream'
     *         after data read, stream too short etc.
     */
    @Override
    protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException
    {
        final COSStream stream = new COSStream( dic, file );
        OutputStream out = null;
        try
        {
            readString();    // read 'stream'; this was already tested in parseObjectsDynamically()
           
            // ---- skip whitespaces before start of data
            //      PDF Ref 1.7, chap. 3.2.7:
            //      'stream' should be followed by either a CRLF (0x0d 0x0a) or LF but nothing else.
            {
                int whitespace = pdfSource.read();
               
                //see brother_scan_cover.pdf, it adds whitespaces
                //after the stream but before the start of the
                //data, so just read those first
                while (whitespace == 0x20)
                {
                    whitespace = pdfSource.read();
                }
               
                if( whitespace == 0x0D )
                {
                    whitespace = pdfSource.read();
                    if( whitespace != 0x0A )
                    {
                        // the spec says this is invalid but it happens in the real
                        // world so we must support it
                        pdfSource.unread( whitespace );
                    }
                }
                else if (whitespace != 0x0A)
                {
                    // no whitespace after 'stream'; PDF ref. says 'should' so that is ok
                    pdfSource.unread( whitespace );
                }
            }
           
            /*This needs to be dic.getItem because when we are parsing, the underlying object
             * might still be null.
             */
            COSNumber streamLengthObj = getLength( dic.getItem( COSName.LENGTH ) );
            if ( streamLengthObj == null )
            {
                      throw new IOException( "Missing length for stream." );
            }
             
            // ---- get output stream to copy data to
            out = stream.createFilteredStream( streamLengthObj );
           
            long remainBytes = streamLengthObj.longValue();
           
            while ( remainBytes > 0 )
            {
                final int readBytes = pdfSource.read( streamCopyBuf, 0,
                        ( remainBytes > streamCopyBufLen ) ? streamCopyBufLen : (int) remainBytes );
                if ( readBytes <= 0 )
                {
                    throw new IOException( "No more bytes from stream but expected: " + remainBytes );
                }    
                out.write( streamCopyBuf, 0, readBytes );
               
                remainBytes -= readBytes;
            }
           
            String endStream = readString();
           
            if ( ! endStream.equals( "endstream" ) )
            {
                throw new IOException( "Error reading stream using length value. Expected='endstream' actual='"
                        + endStream + "' " );
            }
           
        }
        finally
        {
            if ( out != null )
            {
                out.close();
            }
        }
        return stream;
    }
}
TOP

Related Classes of org.apache.pdfbox.pdfparser.NonSequentialPDFParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.