Source Code of org.apache.pdfbox.pdfparser.NonSequentialPDFParser

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;


import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.KeyStore;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.PushBackInputStream;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.PDEncryption;
import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.persistence.util.COSObjectKey;


/**
 * PDFParser which first reads startxref and xref tables in order to know valid objects and parse only these objects.
 * Thus it is closer to a conforming parser than the sequential reading of {@link PDFParser}.
 * 
 * This class can be used as a {@link PDFParser} replacement. First {@link #parse()} must be called before page objects
 * can be retrieved, e.g. {@link #getPDDocument()}.
 * 
 * This class is a much enhanced version of <code>QuickParser</code> presented in <a
 * href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a> by Jeremy Villalobos.
 */
public class NonSequentialPDFParser extends PDFParser
{
    private static final byte[] XREF_TABLE = new byte[] { 'x', 'r', 'e', 'f' };
    private static final byte[] XREF_STREAM = new byte[] { '/','X', 'R', 'e', 'f' };
    private static final long MINIMUM_SEARCH_OFFSET = 6;
    
    private static final int X = 'x';


    /**
     * Only parse the PDF file minimally allowing access to basic information.
     */
    public static final String SYSPROP_PARSEMINIMAL = 
            "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal";
    
    /**
     * The range within the %%EOF marker will be searched.
     * Useful if there are additional characters after %%EOF within the PDF. 
     */
    public static final String SYSPROP_EOFLOOKUPRANGE =
            "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";


    private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream(new byte[0]);


    /**
     * How many trailing bytes to read for EOF marker.
     */
    protected static final int DEFAULT_TRAIL_BYTECOUNT = 2048;
    /**
     * EOF-marker.
     */
    protected static final char[] EOF_MARKER = new char[] { '%', '%', 'E', 'O', 'F' };
    /**
     * StartXRef-marker.
     */
    protected static final char[] STARTXREF_MARKER = new char[] { 's', 't', 'a', 'r', 't', 'x',
            'r', 'e', 'f' };
    /**
     * obj-marker.
     */
    protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };


    /**
     * trailer-marker.
     */
    private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
    
    private long trailerOffset;
    private final File pdfFile;
    private long fileLen;
    private final RandomAccessBufferedFileInputStream raStream;


    /**
     * is parser using auto healing capacity ?
     */
    private boolean isLenient = true;


    /**
     * Contains all found objects of a brute force search.
     */
    private HashMap<String, Long> bfSearchObjectOffsets = null;
    private HashMap<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = null;
    private Vector<Long> bfSearchXRefOffsets = null;


    /**
     * The security handler.
     */
    protected SecurityHandler securityHandler = null;


    private AccessPermission accessPermission;
    private final String keyStoreFilename = null;
    private final String alias = null;
    private String password = "";
    private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT; // how many trailing
                                                          // bytes to read for
                                                          // EOF marker


    /**
     * If <code>true</code> object references in catalog are not followed; pro: page objects will be only parsed when
     * needed; cons: some information of catalog might not be available (e.g. outline). Catalog parsing without pages is
     * not an option since a number of entries will also refer to page objects (like OpenAction).
     */
    private final boolean parseMinimalCatalog = "true".equals(System.getProperty(SYSPROP_PARSEMINIMAL));


    private boolean initialParseDone = false;
    private boolean allPagesParsed = false;


    private static final Log LOG = LogFactory.getLog(NonSequentialPDFParser.class);


    /**
     * <code>true</code> if the NonSequentialPDFParser is initialized by a InputStream, in this case a temporary file is
     * created. At the end of the {@linkplain #parse()} method,the temporary file will be deleted.
     */
    private boolean isTmpPDFFile = false;


    /**
     * The prefix for the temp file being used. 
     */
    public static final String TMP_FILE_PREFIX = "tmpPDF";


    // ------------------------------------------------------------------------
    /**
     * Constructs parser for given file using memory buffer.
     * 
     * @param filename the filename of the pdf to be parsed
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(String filename) throws IOException
    {
        this(new File(filename), null, false);
    }


    /**
     * Constructs parser for given file using memory buffer.
     * 
     * @param filename the filename of the pdf to be parsed.
     * @param useScratchFiles use a buffer for temporary storage.
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(String filename, boolean useScratchFiles) throws IOException
    {
        this(new File(filename), null, useScratchFiles);
    }


    /**
     * Constructs parser for given file using given buffer for temporary
     * storage.
     * 
     * @param file the pdf to be parsed
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(File file) throws IOException
    {
        this(file, "", false);
    }


    /**
     * Constructs parser for given file using given buffer for temporary
     * storage.
     * 
     * @param file the pdf to be parsed
     * @param useScratchFiles use a buffer for temporary storage.
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(File file, boolean useScratchFiles) throws IOException
    {
        this(file, "", useScratchFiles);
    }


    /**
     * Constructs parser for given file using given buffer for temporary storage.
     * 
     * @param file the pdf to be parsed
     * @param decryptionPassword password to be used for decryption
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(File file, String decryptionPassword)
            throws IOException
    {
        this (file, decryptionPassword, false);
    }


    /**
     * Constructs parser for given file using given buffer for temporary storage.
     * 
     * @param file the pdf to be parsed.
     * @param decryptionPassword password to be used for decryption.
     * @param useScratchFiles use a buffer for temporary storage.
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(File file, String decryptionPassword, boolean useScratchFiles)
            throws IOException
    {
        super(EMPTY_INPUT_STREAM, false);
        pdfFile = file;
        raStream = new RandomAccessBufferedFileInputStream(pdfFile);
        init(file, decryptionPassword, useScratchFiles);
    }


    private void init(File file, String decryptionPassword, boolean useScratchFiles) throws IOException
    {
        String eofLookupRangeStr = System.getProperty(SYSPROP_EOFLOOKUPRANGE);
        if (eofLookupRangeStr != null)
        {
            try
            {
                setEOFLookupRange(Integer.parseInt(eofLookupRangeStr));
            }
            catch (NumberFormatException nfe)
            {
                LOG.warn("System property " + SYSPROP_EOFLOOKUPRANGE
                        + " does not contain an integer value, but: '" + eofLookupRangeStr + "'");
            }
        }
        setDocument(new COSDocument(false, useScratchFiles));
        pdfSource = new PushBackInputStream(raStream, 4096);
        password = decryptionPassword;
    }


    /**
     * Constructor.
     * 
     * @param input input stream representing the pdf.
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(InputStream input) throws IOException
    {
        this(input, "", false);
    }


    /**
     * Constructor.
     * 
     * @param input input stream representing the pdf.
     * @param useScratchFiles use a buffer for temporary storage.
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(InputStream input, boolean useScratchFiles) throws IOException
    {
        this(input, "", useScratchFiles);
    }


    /**
     * Constructor.
     * 
     * @param input input stream representing the pdf.
     * @param decryptionPassword password to be used for decryption.
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(InputStream input, String decryptionPassword)
            throws IOException
    {
        this(input, decryptionPassword, false);
    }


    /**
     * Constructor.
     * 
     * @param input input stream representing the pdf.
     * @param decryptionPassword password to be used for decryption.
     * @param useScratchFiles use a buffer for temporary storage.
     *
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(InputStream input, String decryptionPassword, boolean useScratchFiles)
            throws IOException
    {
        super(EMPTY_INPUT_STREAM, false);
        pdfFile = createTmpFile(input);
        raStream = new RandomAccessBufferedFileInputStream(pdfFile);
        init(pdfFile, decryptionPassword, useScratchFiles);
    }


    /**
     * Create a temporary file with the input stream. If the creation succeed, the {@linkplain #isTmpPDFFile} is set to
     * true. This Temporary file will be deleted at end of the parse method
     * 
     * @param input
     * @return the temporary file
     * @throws IOException If something went wrong.
     */
    private File createTmpFile(InputStream input) throws IOException
    {
        FileOutputStream fos = null;
        try
        {
            File tmpFile = File.createTempFile(TMP_FILE_PREFIX, ".pdf");
            fos = new FileOutputStream(tmpFile);
            IOUtils.copy(input, fos);
            isTmpPDFFile = true;
            return tmpFile;
        }
        finally
        {
            IOUtils.closeQuietly(input);
            IOUtils.closeQuietly(fos);
        }
    }


    @Override
    public PDDocument getPDDocument() throws IOException
    {
        return new PDDocument( getDocument(), this, accessPermission );
    }




    // ------------------------------------------------------------------------
    /**
     * Sets how many trailing bytes of PDF file are searched for EOF marker and 'startxref' marker. If not set we use
     * default value {@link #DEFAULT_TRAIL_BYTECOUNT}.
     * 
     * <p<We check that new value is at least 16. However for practical use cases this value should not be lower than
     * 1000; even 2000 was found to not be enough in some cases where some trailing garbage like HTML snippets followed
     * the EOF marker.</p>
     * 
     * <p>
     * In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined this value will be set on initialization but
     * can be overwritten later.
     * </p>
     * 
     * @param byteCount number of trailing bytes
     */
    public void setEOFLookupRange(int byteCount)
    {
        if (byteCount > 15)
        {
            readTrailBytes = byteCount;
        }
    }


    /**
     * The initial parse will first parse only the trailer, the xrefstart and all xref tables to have a pointer (offset)
     * to all the pdf's objects. It can handle linearized pdfs, which will have an xref at the end pointing to an xref
     * at the beginning of the file. Last the root object is parsed.
     * 
     * @throws IOException If something went wrong.
     */
    protected void initialParse() throws IOException
    {
        COSDictionary trailer = null;
        // ---- parse startxref
        long startXRefOffset = getStartxrefOffset();
        if (startXRefOffset > 0)
        {
            trailer = parseXref(startXRefOffset);
        }
        else if (isFDFDocment || isLenient)
        {
            // signal start of new XRef
            xrefTrailerResolver.nextXrefObj( startXRefOffset, XRefType.TABLE );
            bfSearchForObjects();
            for (COSObjectKey objectKey : bfSearchCOSObjectKeyOffsets.keySet())
            {
                xrefTrailerResolver.setXRef(objectKey, bfSearchCOSObjectKeyOffsets.get(objectKey));
            }
            // parse the last trailer.
            pdfSource.seek(trailerOffset);
            if (!parseTrailer())
            {
                throw new IOException("Expected trailer object at position: "
                        + pdfSource.getOffset());
            }
            xrefTrailerResolver.setStartxref(startXRefOffset);
            trailer = xrefTrailerResolver.getCurrentTrailer();
            document.setTrailer(trailer);
            document.setIsXRefStream(false);
        }
        // ---- prepare decryption if necessary
        prepareDecryption();


        // PDFBOX-1557 - ensure that all COSObject are loaded in the trailer
        // PDFBOX-1606 - after securityHandler has been instantiated
        for (COSBase trailerEntry : trailer.getValues())
        {
            if (trailerEntry instanceof COSObject)
            {
                COSObject tmpObj = (COSObject) trailerEntry;
                parseObjectDynamically(tmpObj, false);
            }
        }
        // ---- parse catalog or root object
        COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem(COSName.ROOT);


        if (root == null)
        {
            throw new IOException("Missing root object specification in trailer.");
        }


        COSBase rootObject = parseObjectDynamically(root, false);


        // ---- resolve all objects
        if (isFDFDocment)
        {
            // A FDF doesn't have a catalog, all FDF fields are within the root object
            if (rootObject instanceof COSDictionary)
            {
                parseDictObjects((COSDictionary) rootObject, (COSName[]) null);
                allPagesParsed = true;
                document.setDecrypted();
            }
        }
        else if(!parseMinimalCatalog)
        {
            COSObject catalogObj = document.getCatalog();
            if (catalogObj != null)
            {
                if (catalogObj.getObject() instanceof COSDictionary)
                {
                    parseDictObjects((COSDictionary) catalogObj.getObject(), (COSName[]) null);
                    allPagesParsed = true;
                    document.setDecrypted();
                }
            }
        }


        // PDFBOX-1922: read the version again now that all objects have been resolved
        readVersionInTrailer(trailer);


        initialParseDone = true;
    }


    /**
     * Resolves all not already parsed objects of a dictionary recursively.
     * 
     * @param dictionaryObject dictionary to be parsed
     * @throws IOException if something went wrong
     * 
     */
    private void parseDictionaryRecursive(COSObject dictionaryObject) throws IOException
    {
        parseObjectDynamically(dictionaryObject, true);
        COSDictionary dictionary = (COSDictionary)dictionaryObject.getObject();
        for(COSBase value : dictionary.getValues())
        {
            if (value instanceof COSObject)
            {
                COSObject object = (COSObject)value;
                if (object.getObject() == null)
                {
                    parseDictionaryRecursive(object);
                }
            }
        }
    }
    /**
     * Prepare for decryption.
     * 
     * @throws IOException if something went wrong
     */
    private void prepareDecryption() throws IOException
    {
        COSBase trailerEncryptItem = document.getTrailer().getItem(COSName.ENCRYPT);
        if (trailerEncryptItem != null && !(trailerEncryptItem instanceof COSNull))
        {
            if (trailerEncryptItem instanceof COSObject)
            {
                COSObject trailerEncryptObj = (COSObject) trailerEncryptItem;
                parseDictionaryRecursive(trailerEncryptObj);
            }
            try
            {
                PDEncryption encryption = new PDEncryption(document.getEncryptionDictionary());


                DecryptionMaterial decryptionMaterial;
                if (keyStoreFilename != null)
                {
                    KeyStore ks = KeyStore.getInstance("PKCS12");
                    ks.load(new FileInputStream(keyStoreFilename), password.toCharArray());


                    decryptionMaterial = new PublicKeyDecryptionMaterial(ks, alias, password);
                }
                else
                {
                    decryptionMaterial = new StandardDecryptionMaterial(password);
                }


                securityHandler = encryption.getSecurityHandler();
                securityHandler.prepareForDecryption(encryption, document.getDocumentID(),
                        decryptionMaterial);
                accessPermission = securityHandler.getCurrentAccessPermission();
            }
            catch (Exception e)
            {
                if (e instanceof IOException)
                {
                    throw (IOException) e;
                }
                throw new IOException("Error (" + e.getClass().getSimpleName()
                        + ") while creating security handler for decryption",e);
            }
        }
    }
    
    /**
     * Parses cross reference tables.
     * 
     * @param startXRefOffset start offset of the first table
     * @return the trailer dictionary
     * @throws IOException if something went wrong
     */
    private COSDictionary parseXref(long startXRefOffset) throws IOException
    {
        setPdfSource(startXRefOffset);
        parseStartXref();


        long startXrefOffset = document.getStartXref();
        // check the startxref offset
        long fixedOffset = checkXRefOffset(startXrefOffset);
        if (fixedOffset > -1)
        {
            startXrefOffset = fixedOffset;
            document.setStartXref(startXrefOffset);
        }
        long prev = startXrefOffset;
        // ---- parse whole chain of xref tables/object streams using PREV
        // reference
        while (prev > -1)
        {
            // seek to xref table
            setPdfSource(prev);


            // skip white spaces
            skipSpaces();
            // -- parse xref
            if (pdfSource.peek() == X)
            {
                // xref table and trailer
                // use existing parser to parse xref table
                parseXrefTable(prev);
                // parse the last trailer.
                trailerOffset = pdfSource.getOffset();
                // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
                while (isLenient && pdfSource.peek() != 't')
                {
                    if (pdfSource.getOffset() == trailerOffset)
                    {
                        // warn only the first time
                        LOG.warn("Expected trailer object at position " + trailerOffset
                                + ", keep trying");
                    }
                    readLine();
                }
                if (!parseTrailer())
                {
                    throw new IOException("Expected trailer object at position: "
                            + pdfSource.getOffset());
                }
                COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
                // check for a XRef stream, it may contain some object ids of compressed objects 
                if(trailer.containsKey(COSName.XREF_STM))
                {
                    int streamOffset = trailer.getInt(COSName.XREF_STM);
                    // check the xref stream reference
                    fixedOffset = checkXRefOffset(streamOffset);
                    if (fixedOffset > -1 && fixedOffset != streamOffset)
                    {
                        streamOffset = (int)fixedOffset;
                        trailer.setInt(COSName.XREF_STM, streamOffset);
                    }
                    setPdfSource(streamOffset);
                    skipSpaces();
                    parseXrefObjStream(prev, false); 
                }
                prev = trailer.getInt(COSName.PREV);
                if (prev > -1)
                {
                    // check the xref table reference
                    fixedOffset = checkXRefOffset(prev);
                    if (fixedOffset > -1 && fixedOffset != prev)
                    {
                        prev = fixedOffset;
                        trailer.setLong(COSName.PREV, prev);
                    }
                }
            }
            else
            {
                // parse xref stream
                prev = parseXrefObjStream(prev, true);
                if (prev > -1)
                {
                    // check the xref table reference
                    fixedOffset = checkXRefOffset(prev);
                    if (fixedOffset > -1 && fixedOffset != prev)
                    {
                        prev = fixedOffset;
                        COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
                        trailer.setLong(COSName.PREV, prev);
                    }
                }
            }
        }
        // ---- build valid xrefs out of the xref chain
        xrefTrailerResolver.setStartxref(startXrefOffset);
        COSDictionary trailer = xrefTrailerResolver.getTrailer();
        document.setTrailer(trailer);
        document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType());
        // check the offsets of all referenced objects
        checkXrefOffsets();
        return trailer;
    }


    /**
     * Parses an xref object stream starting with indirect object id.
     * 
     * @return value of PREV item in dictionary or <code>-1</code> if no such item exists
     */
    private long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws IOException
    {
        // ---- parse indirect object head
        readObjectNumber();
        readGenerationNumber();
        readPattern(OBJ_MARKER);


        COSDictionary dict = parseCOSDictionary();
        COSStream xrefStream = parseCOSStream(dict);
        parseXrefStream(xrefStream, (int) objByteOffset, isStandalone);


        return dict.getLong(COSName.PREV);
    }


    // ------------------------------------------------------------------------
    /** Get current offset in file at which next byte would be read. */
    private long getPdfSourceOffset()
    {
        return pdfSource.getOffset();
    }


    /**
     * Sets {@link #pdfSource} to start next parsing at given file offset.
     * 
     * @param fileOffset file offset
     * @throws IOException If something went wrong.
     */
    protected final void setPdfSource(long fileOffset) throws IOException
    {


        pdfSource.seek(fileOffset);


        // alternative using 'old fashioned' input stream
        // if ( pdfSource != null )
        // pdfSource.close();
        //
        // pdfSource = new PushBackInputStream(
        // new BufferedInputStream(
        // new FileInputStream( file ), 16384), 4096);
        // pdfSource.skip( _fileOffset );
    }


    /**
     * Enable handling of alternative pdfSource implementation.
     * 
     * @throws IOException If something went wrong.
     */
    protected final void releasePdfSourceInputStream() throws IOException
    {
        // if ( pdfSource != null )
        // pdfSource.close();
    }


    private void closeFileStream() throws IOException
    {
        if (pdfSource != null)
        {
            pdfSource.close();
        }
    }


    // ------------------------------------------------------------------------
    /**
     * Looks for and parses startxref. We first look for last '%%EOF' marker (within last
     * {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via {@link #setEOFLookupRange(int)}) and go back to find
     * <code>startxref</code>.
     * 
     * @return the offset of StartXref
     * @throws IOException If something went wrong.
     */
    protected final long getStartxrefOffset() throws IOException
    {
        byte[] buf;
        long skipBytes;


        // ---- read trailing bytes into buffer
        fileLen = pdfFile.length();


        FileInputStream fIn = null;
        try
        {
            fIn = new FileInputStream(pdfFile);


            final int trailByteCount = (fileLen < readTrailBytes) ? (int) fileLen : readTrailBytes;
            buf = new byte[trailByteCount];
            fIn.skip(skipBytes = fileLen - trailByteCount);


            int off = 0;
            int readBytes;
            while (off < trailByteCount)
            {
                readBytes = fIn.read(buf, off, trailByteCount - off);
                // in order to not get stuck in a loop we check readBytes (this
                // should never happen)
                if (readBytes < 1)
                {
                    throw new IOException(
                            "No more bytes to read for trailing buffer, but expected: "
                                    + (trailByteCount - off));
                }
                off += readBytes;
            }
        }
        finally
        {
            if (fIn != null)
            {
                try
                {
                    fIn.close();
                }
                catch (IOException ioe)
                {
                }
            }
        }


        // ---- find last '%%EOF'
        int bufOff = lastIndexOf(EOF_MARKER, buf, buf.length);


        if (bufOff < 0)
        {
            if (isLenient) 
            {
                // in lenient mode the '%%EOF' isn't needed
                bufOff = buf.length;
                LOG.debug("Missing end of file marker '" + (new String(EOF_MARKER)) + "'");
            } 
            else 
            {
                throw new IOException("Missing end of file marker '" + (new String(EOF_MARKER)) + "'");
            }
        }
        // ---- find last startxref preceding EOF marker
        bufOff = lastIndexOf(STARTXREF_MARKER, buf, bufOff);


        if (bufOff < 0)
        {
            if (isLenient) 
            {
                trailerOffset = lastIndexOf(TRAILER_MARKER, buf, buf.length);
                if (trailerOffset > 0)
                {
                    trailerOffset += skipBytes;
                }
                return -1;
            }
            else
            {
                throw new IOException("Missing 'startxref' marker.");
            }
        }
        return skipBytes + bufOff;
    }


    // ------------------------------------------------------------------------
    /**
     * Searches last appearance of pattern within buffer. Lookup before _lastOff and goes back until 0.
     * 
     * @param pattern pattern to search for
     * @param buf buffer to search pattern in
     * @param endOff offset (exclusive) where lookup starts at
     * 
     * @return start offset of pattern within buffer or <code>-1</code> if pattern could not be found
     */
    protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff)
    {
        final int lastPatternChOff = pattern.length - 1;


        int bufOff = endOff;
        int patOff = lastPatternChOff;
        char lookupCh = pattern[patOff];


        while (--bufOff >= 0)
        {
            if (buf[bufOff] == lookupCh)
            {
                if (--patOff < 0)
                {
                    // whole pattern matched
                    return bufOff;
                }
                // matched current char, advance to preceding one
                lookupCh = pattern[patOff];
            }
            else if (patOff < lastPatternChOff)
            {
                // no char match but already matched some chars; reset
                lookupCh = pattern[patOff = lastPatternChOff];
            }
        }


        return -1;
    }


    // ------------------------------------------------------------------------
    /**
     * Reads given pattern from {@link #pdfSource}. Skipping whitespace at start and end.
     * 
     * @param pattern pattern to be skipped
     * @throws IOException if pattern could not be read
     */
    protected final void readPattern(final char[] pattern) throws IOException
    {
        skipSpaces();


        for (char c : pattern)
        {
            if (pdfSource.read() != c)
            {
                throw new IOException("Expected pattern '" + new String(pattern)
                        + "' but missed at character '" + c + "' at offset "
                        + pdfSource.getOffset());
            }
        }


        skipSpaces();
    }


    // ------------------------------------------------------------------------
    private COSDictionary pagesDictionary = null;


    /**
     * Returns PAGES {@link COSDictionary} object or throws {@link IOException} if PAGES dictionary does not exist.
     */
    private COSDictionary getPagesObject() throws IOException
    {
        if (pagesDictionary != null)
        {
            return pagesDictionary;
        }
        COSObject pages = (COSObject) document.getCatalog().getItem(COSName.PAGES);


        if (pages == null)
        {
            throw new IOException("Missing PAGES entry in document catalog.");
        }


        COSBase object = parseObjectDynamically(pages, false);


        if (!(object instanceof COSDictionary))
        {
            throw new IOException("PAGES not a dictionary object, but: "
                    + object.getClass().getSimpleName());
        }


        pagesDictionary = (COSDictionary) object;


        return pagesDictionary;
    }


    // ------------------------------------------------------------------------
    /** Parses all objects needed by pages and closes input stream. */
    /**
     * {@inheritDoc}
     */
    @Override
    public void parse() throws IOException
    {
        boolean exceptionOccurred = true; // set to false if all is processed


        try
        {
            // PDFBOX-1922 read the version header and rewind
            // this part copied from the sequential parser
            parseHeader();
            pdfSource.seek(0);


            if (!initialParseDone)
            {
                initialParse();
            }


            // a FDF doesn't have any pages
            if (!isFDFDocment)
            {
                final int pageCount = getPageNumber();


                if (!allPagesParsed)
                {
                    for (int pNr = 0; pNr < pageCount; pNr++)
                    {
                        getPage(pNr);
                    }
                    allPagesParsed = true;
                    document.setDecrypted();
                }
            }
            exceptionOccurred = false;
        }
        finally
        {
            try
            {
                closeFileStream();
            }
            catch (IOException ioe)
            {
            }


            deleteTempFile();


            if (exceptionOccurred && (document != null))
            {
                try
                {
                    document.close();
                    document = null;
                }
                catch (IOException ioe)
                {
                }
            }
        }
    }


    /**
     * Return the pdf file.
     * 
     * @return the pdf file
     */
    protected File getPdfFile()
    {
        return this.pdfFile;
    }


    /**
     * Return true if parser is lenient. Meaning auto healing capacity of the parser are used.
     *
     * @return true if parser is lenient
     */
    public boolean isLenient()
    {
        return isLenient;
    }


    /**
     * Change the parser leniency flag.
     *
     * This method can only be called before the parsing of the file.
     *
     * @param lenient try to handle malformed PDFs.
     *
     * @throws IllegalArgumentException if the method is called after parsing.
     */
    public void setLenient(boolean lenient) throws IllegalArgumentException
    {
        if (initialParseDone)
        {
            throw new IllegalArgumentException("Cannot change leniency after parsing");
        }
        this.isLenient = lenient;
    }


    /**
     * Remove the temporary file. A temporary file is created if this class is instantiated with an InputStream
     */
    protected void deleteTempFile()
    {
        if (isTmpPDFFile)
        {
            try
            {
                if (!pdfFile.delete())
                {
                    LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted");
                }
            }
            catch (SecurityException e)
            {
                LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted", e);
            }
        }
    }


    // ------------------------------------------------------------------------
    /**
     * Returns the number of pages in a document.
     * 
     * @return the number of pages.
     * 
     * @throws IOException if PAGES or other needed object is missing
     */
    public int getPageNumber() throws IOException
    {
        int pageCount = getPagesObject().getInt(COSName.COUNT);


        if (pageCount < 0)
        {
            throw new IOException("No page number specified.");
        }
        return pageCount;
    }


    // ------------------------------------------------------------------------
    /**
     * Returns the page requested with all the objects loaded into it.
     * 
     * @param pageNr starts from 0 to the number of pages.
     * @return the page with the given pagenumber.
     * @throws IOException If something went wrong.
     */
    public PDPage getPage(int pageNr) throws IOException
    {
        getPagesObject();


        // ---- get list of top level pages
        COSArray kids = (COSArray) pagesDictionary.getDictionaryObject(COSName.KIDS);


        if (kids == null)
        {
            throw new IOException("Missing 'Kids' entry in pages dictionary.");
        }


        // ---- get page we are looking for (possibly going recursively into
        // subpages)
        COSObject pageObj = getPageObject(pageNr, kids, 0);


        if (pageObj == null)
        {
            throw new IOException("Page " + pageNr + " not found.");
        }


        // ---- parse all objects necessary to load page.
        COSDictionary pageDict = (COSDictionary) pageObj.getObject();


        if (parseMinimalCatalog && (!allPagesParsed))
        {
            // parse page resources since we did not do this on start
            COSDictionary resDict = (COSDictionary) pageDict.getDictionaryObject(COSName.RESOURCES);
            parseDictObjects(resDict);
        }


        return new PDPage(pageDict);
    }


    /**
     * Returns the object for a specific page. The page tree is made up of kids. The kids have COSArray with COSObjects
     * inside of them. The COSObject can be parsed using the dynamic parsing method We want to only parse the minimum
     * COSObjects and still return a complete page. ready to be used.
     * 
     * @param num the requested page number; numbering starts with 0
     * @param startKids Kids array to start with looking up page number
     * @param startPageCount
     * 
     * @return page object or <code>null</code> if no such page exists
     * 
     * @throws IOException
     */
    private COSObject getPageObject(int num, COSArray startKids, int startPageCount)
            throws IOException
    {
        int curPageCount = startPageCount;
        Iterator<COSBase> kidsIter = startKids.iterator();


        while (kidsIter.hasNext())
        {
            COSObject obj = (COSObject) kidsIter.next();
            COSBase base = obj.getObject();
            if (base == null)
            {
                base = parseObjectDynamically(obj, false);
                obj.setObject(base);
            }


            COSDictionary dic = (COSDictionary) base;
            int count = dic.getInt(COSName.COUNT);
            if (count >= 0)
            {
                // skip this branch if requested page comes later
                if ((curPageCount + count) <= num)
                {
                    curPageCount += count;
                    continue;
                }
            }


            COSArray kids = (COSArray) dic.getDictionaryObject(COSName.KIDS);
            if (kids != null)
            {
                // recursively scan subpages
                COSObject ans = getPageObject(num, kids, curPageCount);
                // if ans is not null, we got what we were looking for
                if (ans != null)
                {
                    return ans;
                }
            }
            else
            {
                // found page?
                if (curPageCount == num)
                {
                    return obj;
                }
                // page has no kids and it is not the page we are looking for
                curPageCount++;
            }
        }
        return null;
    }


    /**
     * Creates a unique object id using object number and object generation number. (requires object number &lt; 2^31))
     */
    private long getObjectId(final COSObject obj)
    {
        return (obj.getObjectNumber().longValue() << 32) | obj.getGenerationNumber().longValue();
    }


    /**
     * Adds all from newObjects to toBeParsedList if it is not an COSObject or we didn't add this COSObject already
     * (checked via addedObjects).
     */
    private void addNewToList(final Queue<COSBase> toBeParsedList,
            final Collection<COSBase> newObjects, final Set<Long> addedObjects)
    {
        for (COSBase newObject : newObjects)
        {
            if (newObject instanceof COSObject)
            {
                final long objId = getObjectId((COSObject) newObject);
                if (!addedObjects.add(objId))
                {
                    continue;
                }
            }
            toBeParsedList.add(newObject);
        }
    }


    /**
     * Adds newObject to toBeParsedList if it is not an COSObject or we didn't add this COSObject already (checked via
     * addedObjects).
     */
    private void addNewToList(final Queue<COSBase> toBeParsedList, final COSBase newObject,
            final Set<Long> addedObjects)
    {
        if (newObject instanceof COSObject)
        {
            final long objId = getObjectId((COSObject) newObject);
            if (!addedObjects.add(objId))
            {
                return;
            }
        }
        toBeParsedList.add(newObject);
    }


    /**
     * Will parse every object necessary to load a single page from the pdf document. We try our best to order objects
     * according to offset in file before reading to minimize seek operations.
     * 
     * @param dict the COSObject from the parent pages.
     * @param excludeObjects dictionary object reference entries with these names will not be parsed
     * 
     * @throws IOException
     */
    private void parseDictObjects(COSDictionary dict, COSName... excludeObjects) throws IOException
    {
        // ---- create queue for objects waiting for further parsing
        final Queue<COSBase> toBeParsedList = new LinkedList<COSBase>();
        // offset ordered object map
        final TreeMap<Long, List<COSObject>> objToBeParsed = new TreeMap<Long, List<COSObject>>();
        // in case of compressed objects offset points to stmObj
        final Set<Long> parsedObjects = new HashSet<Long>();
        final Set<Long> addedObjects = new HashSet<Long>();


        // ---- add objects not to be parsed to list of already parsed objects
        if (excludeObjects != null)
        {
            for (COSName objName : excludeObjects)
            {
                COSBase baseObj = dict.getItem(objName);
                if (baseObj instanceof COSObject)
                {
                    parsedObjects.add(getObjectId((COSObject) baseObj));
                }
            }
        }


        addNewToList(toBeParsedList, dict.getValues(), addedObjects);


        // ---- go through objects to be parsed
        while (!(toBeParsedList.isEmpty() && objToBeParsed.isEmpty()))
        {
            // -- first get all COSObject from other kind of objects and
            // put them in objToBeParsed; afterwards toBeParsedList is empty
            COSBase baseObj;
            while ((baseObj = toBeParsedList.poll()) != null)
            {
                if (baseObj instanceof COSStream)
                {
                    addNewToList(toBeParsedList, ((COSStream) baseObj).getValues(), addedObjects);
                }
                else if (baseObj instanceof COSDictionary)
                {
                    addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(),
                            addedObjects);
                }
                else if (baseObj instanceof COSArray)
                {
                    final Iterator<COSBase> arrIter = ((COSArray) baseObj).iterator();
                    while (arrIter.hasNext())
                    {
                        addNewToList(toBeParsedList, arrIter.next(), addedObjects);
                    }
                }
                else if (baseObj instanceof COSObject)
                {
                    COSObject obj = (COSObject) baseObj;
                    long objId = getObjectId(obj);
                    COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber().intValue(), obj
                            .getGenerationNumber().intValue());


                    if (!(parsedObjects.contains(objId) /*
                                                         * || document.hasObjectInPool ( objKey )
                                                         */))
                    {
                        Long fileOffset = xrefTrailerResolver.getXrefTable().get(objKey);
                        // it is allowed that object references point to null,
                        // thus we have to test
                        if (fileOffset != null && fileOffset != 0)
                        {
                            if (fileOffset > 0)
                            {
                                objToBeParsed.put(fileOffset, Collections.singletonList(obj));
                            }
                            else
                            {
                                // negative offset means we have a compressed
                                // object within object stream;
                                // get offset of object stream
                                fileOffset = xrefTrailerResolver.getXrefTable().get(
                                        new COSObjectKey(-fileOffset, 0));
                                if ((fileOffset == null) || (fileOffset <= 0))
                                {
                                    throw new IOException(
                                            "Invalid object stream xref object reference for key '" + objKey + "': "
                                                    + fileOffset);
                                }


                                List<COSObject> stmObjects = objToBeParsed.get(fileOffset);
                                if (stmObjects == null)
                                {
                                    objToBeParsed.put(fileOffset,
                                            stmObjects = new ArrayList<COSObject>());
                                }
                                stmObjects.add(obj);
                            }
                        }
                        else
                        {
                            // NULL object
                            COSObject pdfObject = document.getObjectFromPool(objKey);
                            pdfObject.setObject(COSNull.NULL);
                        }
                    }
                }
            }


            // ---- read first COSObject with smallest offset;
            // resulting object will be added to toBeParsedList
            if (objToBeParsed.isEmpty())
            {
                break;
            }


            for (COSObject obj : objToBeParsed.remove(objToBeParsed.firstKey()))
            {
                COSBase parsedObj = parseObjectDynamically(obj, false);


                obj.setObject(parsedObj);
                addNewToList(toBeParsedList, parsedObj, addedObjects);


                parsedObjects.add(getObjectId(obj));
            }
        }
    }


    /**
     * This will parse the next object from the stream and add it to the local state. This is taken from
     * {@link PDFParser} and reduced to parsing an indirect object.
     * 
     * @param obj object to be parsed (we only take object number and generation number for lookup start offset)
     * @param requireExistingNotCompressedObj if <code>true</code> object to be parsed must not be contained within
     * compressed stream
     * @return the parsed object (which is also added to document object)
     * 
     * @throws IOException If an IO error occurs.
     */
    protected final COSBase parseObjectDynamically(COSObject obj,
            boolean requireExistingNotCompressedObj) throws IOException
    {
        return parseObjectDynamically(obj.getObjectNumber().intValue(), obj.getGenerationNumber()
                .intValue(), requireExistingNotCompressedObj);
    }


    /**
     * This will parse the next object from the stream and add it to the local state. This is taken from
     * {@link PDFParser} and reduced to parsing an indirect object.
     * 
     * @param objNr object number of object to be parsed
     * @param objGenNr object generation number of object to be parsed
     * @param requireExistingNotCompressedObj if <code>true</code> the object to be parsed must be defined in xref
     * (comment: null objects may be missing from xref) and it must not be a compressed object within object stream
     * (this is used to circumvent being stuck in a loop in a malicious PDF)
     * 
     * @return the parsed object (which is also added to document object)
     * 
     * @throws IOException If an IO error occurs.
     */
    protected COSBase parseObjectDynamically(int objNr, int objGenNr,
            boolean requireExistingNotCompressedObj) throws IOException
    {
        // ---- create object key and get object (container) from pool
        final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr);
        final COSObject pdfObject = document.getObjectFromPool(objKey);


        if (pdfObject.getObject() == null)
        {
            // not previously parsed
            // ---- read offset or object stream object number from xref table
            Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get(objKey);


            // sanity test to circumvent loops with broken documents
            if (requireExistingNotCompressedObj
                    && ((offsetOrObjstmObNr == null) || (offsetOrObjstmObNr <= 0)))
            {
                throw new IOException("Object must be defined and must not be compressed object: "
                        + objKey.getNumber() + ":" + objKey.getGeneration());
            }


            if (offsetOrObjstmObNr == null)
            {
                // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
                pdfObject.setObject(COSNull.NULL);
            }
            else if (offsetOrObjstmObNr > 0)
            {
                // offset of indirect object in file
                // ---- go to object start
                setPdfSource(offsetOrObjstmObNr);


                // ---- we must have an indirect object
                final long readObjNr = readObjectNumber();
                final long readObjGen = readGenerationNumber();
                readPattern(OBJ_MARKER);


                // ---- consistency check
                if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration()))
                {
                    throw new IOException("XREF for " + objKey.getNumber() + ":"
                            + objKey.getGeneration() + " points to wrong object: " + readObjNr
                            + ":" + readObjGen);
                }


                skipSpaces();
                COSBase pb = parseDirObject();
                String endObjectKey = readString();


                if (endObjectKey.equals("stream"))
                {
                    pdfSource.unread(endObjectKey.getBytes("ISO-8859-1"));
                    pdfSource.unread(' ');
                    if (pb instanceof COSDictionary)
                    {
                        COSStream stream = parseCOSStream((COSDictionary) pb);


                        if (securityHandler != null)
                        {
                            securityHandler.decryptStream(stream, objNr, objGenNr);
                        }
                        pb = stream;
                    }
                    else
                    {
                        // this is not legal
                        // the combination of a dict and the stream/endstream
                        // forms a complete stream object
                        throw new IOException("Stream not preceded by dictionary (offset: "
                                + offsetOrObjstmObNr + ").");
                    }
                    skipSpaces();
                    endObjectKey = readLine();


                    // we have case with a second 'endstream' before endobj
                    if (!endObjectKey.startsWith("endobj"))
                    {
                        if (endObjectKey.startsWith("endstream"))
                        {
                            endObjectKey = endObjectKey.substring(9).trim();
                            if (endObjectKey.length() == 0)
                            {
                                // no other characters in extra endstream line
                                endObjectKey = readLine(); // read next line
                            }
                        }
                    }
                }
                else if (securityHandler != null)
                {
                    // decrypt
                    if (pb instanceof COSString)
                    {
                        decrypt((COSString) pb, objNr, objGenNr);
                    }
                    else if (pb instanceof COSDictionary)
                    {
                        COSDictionary dict = (COSDictionary) pb;
                        // skip dictionary containing the signature
                        if (!COSName.SIG.equals(dict.getCOSName(COSName.TYPE)))
                        {
                            for (Entry<COSName, COSBase> entry : dict.entrySet())
                            {
                                if (entry.getValue() instanceof COSString)
                                {
                                    decrypt((COSString) entry.getValue(), objNr, objGenNr);
                                }
                                else if (entry.getValue() instanceof COSArray)
                                {
                                    securityHandler.decryptArray((COSArray) entry.getValue(), objNr, objGenNr);
                                }                            
                            }
                        }
                    }
                    else if (pb instanceof COSArray)
                    {
                        final COSArray array = (COSArray) pb;
                        for (int aIdx = 0, len = array.size(); aIdx < len; aIdx++)
                        {
                            if (array.get(aIdx) instanceof COSString)
                            {
                                decrypt((COSString) array.get(aIdx), objNr, objGenNr);
                            }
                        }
                    }
                }


                pdfObject.setObject(pb);


                if (!endObjectKey.startsWith("endobj"))
                {
                    if (isLenient)
                    {
                        LOG.warn("Object (" + readObjNr + ":" + readObjGen + ") at offset "
                                + offsetOrObjstmObNr + " does not end with 'endobj' but with '"
                                + endObjectKey + "'");
                    }
                    else
                    {
                        throw new IOException("Object (" + readObjNr + ":" + readObjGen
                                + ") at offset " + offsetOrObjstmObNr
                                + " does not end with 'endobj' but with '" + endObjectKey + "'");
                    }
                }


                releasePdfSourceInputStream();


            }
            else
            {
                // xref value is object nr of object stream containing object to
                // be parsed;
                // since our object was not found it means object stream was not
                // parsed so far
                final int objstmObjNr = (int) (-offsetOrObjstmObNr);
                final COSBase objstmBaseObj = parseObjectDynamically(objstmObjNr, 0, true);
                if (objstmBaseObj instanceof COSStream)
                {
                    // parse object stream
                    PDFObjectStreamParser parser = new PDFObjectStreamParser(
                            (COSStream) objstmBaseObj, document, forceParsing);
                    parser.parse();


                    // get set of object numbers referenced for this object
                    // stream
                    final Set<Long> refObjNrs = xrefTrailerResolver
                            .getContainedObjectNumbers(objstmObjNr);


                    // register all objects which are referenced to be contained
                    // in object stream
                    for (COSObject next : parser.getObjects())
                    {
                        COSObjectKey stmObjKey = new COSObjectKey(next);
                        if (refObjNrs.contains(stmObjKey.getNumber()))
                        {
                            COSObject stmObj = document.getObjectFromPool(stmObjKey);
                            stmObj.setObject(next.getObject());
                        }
                    }
                }
            }
        }
        return pdfObject.getObject();
    }


    // ------------------------------------------------------------------------
    /**
     * Decrypts given COSString.
     * 
     * @param str the string to be decrypted
     * @param objNr the object number
     * @param objGenNr the object generation number
     * @throws IOException ff something went wrong
     */
    protected final void decrypt(COSString str, long objNr, long objGenNr) throws IOException
    {
        securityHandler.decryptString(str, objNr, objGenNr);
    }


    // ------------------------------------------------------------------------
    private boolean inGetLength = false;


    /** Returns length value referred to or defined in given object. */
    private COSNumber getLength(final COSBase lengthBaseObj) throws IOException
    {
        if (lengthBaseObj == null)
        {
            return null;
        }


        if (inGetLength)
        {
            throw new IOException("Loop while reading length from " + lengthBaseObj);
        }


        COSNumber retVal = null;


        try
        {
            inGetLength = true;


            // ---- maybe length was given directly
            if (lengthBaseObj instanceof COSNumber)
            {
                retVal = (COSNumber) lengthBaseObj;
            }
            // ---- length in referenced object
            else if (lengthBaseObj instanceof COSObject)
            {
                COSObject lengthObj = (COSObject) lengthBaseObj;


                if (lengthObj.getObject() == null)
                {
                    // not read so far


                    // keep current stream position
                    final long curFileOffset = getPdfSourceOffset();
                    releasePdfSourceInputStream();


                    parseObjectDynamically(lengthObj, true);


                    // reset current stream position
                    setPdfSource(curFileOffset);


                    if (lengthObj.getObject() == null)
                    {
                        throw new IOException("Length object content was not read.");
                    }
                }


                if (!(lengthObj.getObject() instanceof COSNumber))
                {
                    throw new IOException("Wrong type of referenced length object " + lengthObj
                            + ": " + lengthObj.getObject().getClass().getSimpleName());
                }


                retVal = (COSNumber) lengthObj.getObject();


            }
            else
            {
                throw new IOException("Wrong type of length object: "
                        + lengthBaseObj.getClass().getSimpleName());
            }
        }
        finally
        {
            inGetLength = false;
        }
        return retVal;
    }


    // ------------------------------------------------------------------------
    private final int streamCopyBufLen = 8192;
    private final byte[] streamCopyBuf = new byte[streamCopyBufLen];


    /**
     * This will read a COSStream from the input stream using length attribute within dictionary. If length attribute is
     * a indirect reference it is first resolved to get the stream length. This means we copy stream data without
     * testing for 'endstream' or 'endobj' and thus it is no problem if these keywords occur within stream. We require
     * 'endstream' to be found after stream data is read.
     * 
     * @param dic dictionary that goes with this stream.
     * 
     * @return parsed pdf stream.
     * 
     * @throws IOException if an error occurred reading the stream, like problems with reading length attribute, stream
     * does not end with 'endstream' after data read, stream too short etc.
     */
    @Override
    protected COSStream parseCOSStream(COSDictionary dic) throws IOException
    {
        final COSStream stream = createCOSStream(dic);
        OutputStream out = null;
        try
        {
            readString(); // read 'stream'; this was already tested in
                          // parseObjectsDynamically()


            // ---- skip whitespaces before start of data
            // PDF Ref 1.7, chap. 3.2.7:
            // 'stream' should be followed by either a CRLF (0x0d 0x0a) or LF
            // but nothing else.
            int whitespace = pdfSource.read();
            
            // see brother_scan_cover.pdf, it adds whitespaces
            // after the stream but before the start of the
            // data, so just read those first
            while (whitespace == 0x20)
            {
                whitespace = pdfSource.read();
            }


            if (whitespace == 0x0D)
            {
                whitespace = pdfSource.read();
                if (whitespace != 0x0A)
                {
                    // the spec says this is invalid but it happens in the
                    // real world so we must support it
                    pdfSource.unread(whitespace);
                }
            }
            else if (whitespace != 0x0A)
            {
                // no whitespace after 'stream'; PDF ref. says 'should' so
                // that is ok
                pdfSource.unread(whitespace);
            }


            /*
             * This needs to be dic.getItem because when we are parsing, the underlying object might still be null.
             */
            COSNumber streamLengthObj = getLength(dic.getItem(COSName.LENGTH));
            if (streamLengthObj == null)
            {
                throw new IOException("Missing length for stream.");
            }


            boolean useReadUntilEnd = false;
            // ---- get output stream to copy data to
            if (validateStreamLength(streamLengthObj.longValue()))
            {
                out = stream.createFilteredStream(streamLengthObj);
                long remainBytes = streamLengthObj.longValue();
                int bytesRead = 0;
                while (remainBytes > 0)
                {
                    final int readBytes = pdfSource
                            .read(streamCopyBuf,
                                    0,
                                    (remainBytes > streamCopyBufLen) ? streamCopyBufLen : (int) remainBytes);
                    if (readBytes <= 0)
                    {
                        useReadUntilEnd = true;
                        out.close();
                        pdfSource.unread(bytesRead);
                        break;
                    }
                    out.write(streamCopyBuf, 0, readBytes);
                    remainBytes -= readBytes;
                    bytesRead += readBytes;
                }
            }
            else
            {
                useReadUntilEnd = true;
            }
            if (useReadUntilEnd)
            {
                out = stream.createFilteredStream();
                readUntilEndStream(new EndstreamOutputStream(out));
            }
            String endStream = readString();
            if (endStream.equals("endobj") && isLenient)
            {
                LOG.warn("stream ends with 'endobj' instead of 'endstream' at offset "
                        + pdfSource.getOffset());
                // avoid follow-up warning about missing endobj
                pdfSource.unread("endobj".getBytes("ISO-8859-1"));
            }
            else if (endStream.length() > 9 && isLenient && endStream.substring(0,9).equals("endstream"))
            {
                LOG.warn("stream ends with '" + endStream + "' instead of 'endstream' at offset "
                        + pdfSource.getOffset());
                // unread the "extra" bytes
                pdfSource.unread(endStream.substring(9).getBytes("ISO-8859-1"));
            }
            else if (!endStream.equals("endstream"))
            {
                throw new IOException(
                        "Error reading stream, expected='endstream' actual='"
                        + endStream + "' at offset " + pdfSource.getOffset());
            }
        }
        finally
        {
            if (out != null)
            {
                out.close();
            }
        }
        return stream;
    }


    private boolean validateStreamLength(long streamLength) throws IOException
    {
        boolean streamLengthIsValid = true;
        long originOffset = pdfSource.getOffset();
        long expectedEndOfStream = originOffset + streamLength;
        if (expectedEndOfStream > fileLen)
        {
            streamLengthIsValid = false;
            LOG.error("The end of the stream is out of range, using workaround to read the stream");
            LOG.error("Stream start offset: " + originOffset);
            LOG.error("Expected endofstream offset: " + expectedEndOfStream);
        }
        else
        {
            pdfSource.seek(expectedEndOfStream);
            skipSpaces();
            if (!checkBytesAtOffset("endstream".getBytes("ISO-8859-1")))
            {
                streamLengthIsValid = false;
                LOG.error("The end of the stream doesn't point to the correct offset, using workaround to read the stream");
                LOG.error("Stream start offset: " + originOffset);
                LOG.error("Expected endofstream offset: " + expectedEndOfStream);
            }
            pdfSource.seek(originOffset);
        }
        return streamLengthIsValid;
    }


    /**
     * Check if the cross reference table/stream can be found at the current offset.
     * 
     * @param startXRefOffset
     * @return the revised offset
     * @throws IOException
     */
    private long checkXRefOffset(long startXRefOffset) throws IOException
    {
        // repair mode isn't available in non-lenient mode
        if (!isLenient)
        {
            return startXRefOffset;
        }
        setPdfSource(startXRefOffset);
        if (pdfSource.peek() == X && checkBytesAtOffset(XREF_TABLE))
        {
            return startXRefOffset;
        }
        int nextValue = pdfSource.peek();
        // maybe there isn't a xref table but a xref stream
        // is the next character a digit?
        if (nextValue > 47 && nextValue < 57)
        {
            try
            {
                // Maybe it's a XRef stream
                readObjectNumber();
                readGenerationNumber();
                readPattern(OBJ_MARKER);
                setPdfSource(startXRefOffset);
                return startXRefOffset;
            }
            catch (IOException exception)
            {
                // there wasn't an object of a xref stream
                // try to repair the offset
                pdfSource.seek(startXRefOffset);
            }
        }
        // try to find a fixed offset
        return calculateXRefFixedOffset(startXRefOffset);
    }


    /**
     * Check if the given bytes can be found at the current offset.
     * 
     * @param string the bytes to look for
     * @return true if the bytes are in place, false if not
     * @throws IOException if something went wrong
     */
    private boolean checkBytesAtOffset(byte[] string) throws IOException
    {
        boolean bytesMatching = false;
        if (pdfSource.peek() == string[0])
        {
            int length = string.length;
            byte[] bytesRead = new byte[length];
            int numberOfBytes = pdfSource.read(bytesRead, 0, length);
            while (numberOfBytes < length)
            {
                int readMore = pdfSource.read(bytesRead, numberOfBytes, length - numberOfBytes);
                if (readMore < 0)
                {
                    break;
                }
                numberOfBytes += readMore;
            }
            if (Arrays.equals(string, bytesRead))
            {
                bytesMatching = true;
            }
            pdfSource.unread(bytesRead, 0, numberOfBytes);
        }
        return bytesMatching;
    }


    /**
     * Try to find a fixed offset for the given xref table/stream.
     * 
     * @param objectOffset the given offset where to look at
     * @return the fixed offset
     * 
     * @throws IOException if something went wrong
     */
    private long calculateXRefFixedOffset(long objectOffset) throws IOException
    {
        if (objectOffset < 0)
        {
            LOG.error("Invalid object offset " + objectOffset + " when searching for a xref table/stream");
            return 0;
        }
        // start a brute force search for all xref tables and try to find the offset we are looking for
        long newOffset = bfSearchForXRef(objectOffset);
        if (newOffset > -1)
        {
            LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + newOffset);
            return newOffset;
        }
        LOG.error("Can't find the object axref table/stream at offset " + objectOffset);
        return 0;
    }


    /**
     * Check the XRef table by dereferencing all objects and fixing the offset if necessary.
     * 
     * @throws IOException if something went wrong.
     */
    private void checkXrefOffsets() throws IOException
    {
        // repair mode isn't available in non-lenient mode
        if (!isLenient)
        {
            return;
        }
        Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable();
        if (xrefOffset != null)
        {
            for (COSObjectKey objectKey : xrefOffset.keySet())
            {
                Long objectOffset = xrefOffset.get(objectKey);
                // a negative offset number represents a object number itself
                // see type 2 entry in xref stream
                if (objectOffset != null && objectOffset > 0)
                {
                    long objectNr = objectKey.getNumber();
                    long objectGen = objectKey.getGeneration();
                    String objectString = createObjectString(objectNr, objectGen);
                    if (!checkObjectId(objectString, objectOffset))
                    {
                        long newOffset = bfSearchForObject(objectString);
                        if (newOffset > -1)
                        {
                            xrefOffset.put(objectKey, newOffset);
                            LOG.debug("Fixed reference for object " + objectNr + " " + objectGen
                                    + " " + objectOffset + " -> " + newOffset);
                        }
                        else
                        {
                            LOG.error("Can't find the object " + objectNr + " " + objectGen
                                    + " (origin offset " + objectOffset + ")");
                        }
                    }
                }
            }
        }
    }


    /**
     * Check if the given string can be found at the given offset.
     * 
     * @param objectString the string we are looking for
     * @param offset the given where to look
     * @return returns true if the given string can be found at the givwen offset
     * @throws IOException if something went wrong
     */
    private boolean checkObjectId(String objectString, long offset) throws IOException
    {
        boolean objectFound = false;
        long originOffset = pdfSource.getOffset();
        pdfSource.seek(offset);
        objectFound = checkBytesAtOffset(objectString.getBytes("ISO-8859-1"));
        pdfSource.seek(originOffset);
        return objectFound;
    }


    /**
     * Create a string for the given object id.
     * 
     * @param objectID the object id
     * @param genID the generation id
     * @return the generated string
     */
    private String createObjectString(long objectID, long genID)
    {
        return Long.toString(objectID) + " " + Long.toString(genID) + " obj";
    }


    /**
     * Search for the offset of the given object among the objects found by a brute force search.
     * 
     * @param objectString the object we are looking for
     * @return the offset of the object
     * @throws IOException if something went wrong
     */
    private long bfSearchForObject(String objectString) throws IOException
    {
        long newOffset = -1;
        bfSearchForObjects();
        if (bfSearchObjectOffsets.containsKey(objectString))
        {
            newOffset = bfSearchObjectOffsets.get(objectString);
        }
        return newOffset;
    }


    /**
     * Brute force search for every object in the pdf.
     *   
     * @throws IOException if something went wrong
     */
    private void bfSearchForObjects() throws IOException
    {
        if (bfSearchObjectOffsets == null)
        {
            bfSearchObjectOffsets = new HashMap<String, Long>();
            bfSearchCOSObjectKeyOffsets = new HashMap<COSObjectKey, Long>();
            long originOffset = pdfSource.getOffset();
            long currentOffset = MINIMUM_SEARCH_OFFSET;
            String objString = " obj";
            byte[] string = objString.getBytes("ISO-8859-1");
            do
            {
                pdfSource.seek(currentOffset);
                if (checkBytesAtOffset(string))
                {
                    long tempOffset = currentOffset - 1;
                    pdfSource.seek(tempOffset);
                    int genID = pdfSource.peek();
                    // is the next char a digit?
                    if (genID > 47 && genID < 58)
                    {
                        genID -= 48;
                        tempOffset--;
                        pdfSource.seek(tempOffset);
                        if (pdfSource.peek() == 32)
                        {
                            while (tempOffset > MINIMUM_SEARCH_OFFSET && pdfSource.peek() == 32)
                            {
                                pdfSource.seek(--tempOffset);
                            }
                            int length = 0;
                            while (tempOffset > MINIMUM_SEARCH_OFFSET && pdfSource.peek() > 47
                                    && pdfSource.peek() < 58)
                            {
                                pdfSource.seek(--tempOffset);
                                length++;
                            }
                            if (length > 0)
                            {
                                pdfSource.read();
                                byte[] objIDBytes = pdfSource.readFully(length);
                                String objIdString = new String(objIDBytes, 0,
                                        objIDBytes.length, "ISO-8859-1");
                                Long objectID = null;
                                try
                                {
                                    objectID = Long.valueOf(objIdString);
                                }
                                catch (NumberFormatException excpetion)
                                {
                                    objectID = null;
                                }
                                if (objectID != null)
                                {
                                    bfSearchObjectOffsets.put(
                                            createObjectString(objectID, genID), ++tempOffset);
                                    bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(objectID, genID), tempOffset);
                                }
                            }
                        }
                    }
                }
                currentOffset++;
            } while (!pdfSource.isEOF());
            // reestablish origin position
            pdfSource.seek(originOffset);
        }
    }


    /**
     * Search for the offset of the given xref table/stream among those found by a brute force search.
     * 
     * @return the offset of the xref entry
     * @throws IOException if something went wrong
     */
    private long bfSearchForXRef(long xrefOffset) throws IOException
    {
        long newOffset = -1;
        bfSearchForXRefs();
        if (bfSearchXRefOffsets != null)
        {
            long currentDifference = -1;
            int currentOffsetIndex = -1;
            int numberOfOffsets = bfSearchXRefOffsets.size();
            // find the most likely value
            // TODO to be optimized, this won't work in every case
            for (int i = 0; i < numberOfOffsets; i++)
            {
                long newDifference = xrefOffset - bfSearchXRefOffsets.get(i);
                // find the nearest offset
                if (currentDifference == -1
                        || (Math.abs(currentDifference) > Math.abs(newDifference)))
                {
                    currentDifference = newDifference;
                    currentOffsetIndex = i;
                }
            }
            if (currentOffsetIndex > -1)
            {
                newOffset = bfSearchXRefOffsets.remove(currentOffsetIndex);
            }
        }
        return newOffset;
    }


    /**
     * Brute force search for all xref entries.
     * 
     * @throws IOException if something went wrong
     */
    private void bfSearchForXRefs() throws IOException
    {
        if (bfSearchXRefOffsets == null)
        {
            // a pdf may contain more than one xref entry
            bfSearchXRefOffsets = new Vector<Long>();
            long originOffset = pdfSource.getOffset();
            pdfSource.seek(MINIMUM_SEARCH_OFFSET);
            // search for xref tables
            while (!pdfSource.isEOF())
            {
                if (checkBytesAtOffset(XREF_TABLE))
                {
                    long newOffset = pdfSource.getOffset();
                    pdfSource.seek(newOffset - 1);
                    // ensure that we don't read "startxref" instead of "xref"
                    if (isWhitespace())
                    {
                        bfSearchXRefOffsets.add(newOffset);
                    }
                    pdfSource.seek(newOffset + 4);
                }
                pdfSource.read();
            }
            pdfSource.seek(MINIMUM_SEARCH_OFFSET);
            // search for XRef streams
            String objString = " obj";
            byte[] string = objString.getBytes("ISO-8859-1");
            while (!pdfSource.isEOF())
            {
                if (checkBytesAtOffset(XREF_STREAM))
                {
                    // search backwards for the beginning of the stream
                    long newOffset = -1;
                    long xrefOffset = pdfSource.getOffset();
                    long currentOffset = xrefOffset;
                    boolean objFound = false;
                    for (int i = 1; i < 30 && !objFound; i++)
                    {
                        currentOffset = xrefOffset - (i * 10);
                        if (currentOffset > 0)
                        {
                            pdfSource.seek(currentOffset);
                            for (int j = 0; j < 10; j++)
                            {
                                if (checkBytesAtOffset(string))
                                {
                                    long tempOffset = currentOffset - 1;
                                    pdfSource.seek(tempOffset);
                                    int genID = pdfSource.peek();
                                    // is the next char a digit?
                                    if (genID > 47 && genID < 58)
                                    {
                                        genID -= 48;
                                        tempOffset--;
                                        pdfSource.seek(tempOffset);
                                        if (pdfSource.peek() == 32)
                                        {
                                            int length = 0;
                                            pdfSource.seek(--tempOffset);
                                            while (tempOffset > MINIMUM_SEARCH_OFFSET
                                                    && pdfSource.peek() > 47
                                                    && pdfSource.peek() < 58)
                                            {
                                                pdfSource.seek(--tempOffset);
                                                length++;
                                            }
                                            if (length > 0)
                                            {
                                                pdfSource.read();
                                                newOffset = pdfSource.getOffset();
                                            }
                                        }
                                    }
                                    LOG.debug("Fixed reference for xref stream " + xrefOffset
                                            + " -> " + newOffset);
                                    objFound = true;
                                    break;
                                }
                                else
                                {
                                    currentOffset++;
                                    pdfSource.read();
                                }
                            }
                        }
                    }
                    if (newOffset > -1)
                    {
                        bfSearchXRefOffsets.add(newOffset);
                    }
                    pdfSource.seek(xrefOffset + 5);
                }
                pdfSource.read();
            }
            pdfSource.seek(originOffset);
        }
    }
}
Source Code of org.apache.pdfbox.pdfparser.NonSequentialPDFParser

Related Classes of org.apache.pdfbox.pdfparser.NonSequentialPDFParser