/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.KeyStore;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import java.util.Map.Entry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.io.PushBackInputStream;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandlersManager;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.persistence.util.COSObjectKey;
/**
* PDFParser which first reads startxref and xref tables in order to know valid
* objects and parse only these objects. Thus it is closer to a conforming parser
* than the sequential reading of {@link PDFParser}.
*
* This class can be used as a {@link PDFParser} replacement. First {@link #parse()}
* must be called before page objects can be retrieved, e.g. {@link #getPDDocument()}.
*
* This class is a much enhanced version of <code>QuickParser</code> presented in
* <a href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a>
* by Jeremy Villalobos.
*/
public class NonSequentialPDFParser extends PDFParser
{
/** Name of the system property that enables minimal catalog parsing when its value is "true". */
public static final String SYSPROP_PARSEMINIMAL =
"org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal";
/** Name of the system property whose integer value is handed to {@link #setEOFLookupRange(int)} by the constructor. */
public static final String SYSPROP_EOFLOOKUPRANGE =
"org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";
/** Empty stream passed to the super constructor; the constructor of this class assigns the real source afterwards. */
private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream( new byte[0] );
/** Default number of trailing file bytes searched for the EOF and 'startxref' markers. */
private static final int DEFAULT_TRAIL_BYTECOUNT = 2048;
/** End-of-file marker searched for in the trailing bytes of the file. */
private static final char[] EOF_MARKER = new char[] { '%','%','E','O','F' };
/** Marker preceding the offset of the last xref section. */
private static final char[] STARTXREF_MARKER = new char[] { 's','t','a','r','t','x','r','e','f' };
/** Keyword that follows object number and generation number in an indirect object head. */
private static final char[] OBJ_MARKER = new char[] { 'o','b','j' };
/** The PDF file being parsed. */
private final File pdfFile;
/** Random access stream over {@link #pdfFile}; wrapped by the push back source set up in the constructor. */
private final RandomAccessBufferedFileInputStream raStream;
/** Security handler created during the initial parse if the trailer has an ENCRYPT entry; otherwise <code>null</code>. */
private SecurityHandler securityHandler = null;
// keyStoreFilename and alias are only read in the visible part of this file (public key decryption);
// NOTE(review): no assignment is visible here - confirm where they are set.
private String keyStoreFilename = null;
private String alias = null;
/** Password used for decryption (also used as key store password). */
private String password = "";
private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT; // how many trailing bytes to read for EOF marker
/** If <code>true</code> object references in catalog are not followed;
* pro: page objects will be only parsed when needed; cons: some information of catalog
* might not be available (e.g. outline).
* Catalog parsing without pages is not an option since a number of entries will
* also refer to page objects (like OpenAction).
*/
private boolean parseMinimalCatalog = "true".equals( System.getProperty( SYSPROP_PARSEMINIMAL ) );
/** Set to <code>true</code> at the end of the initial parse so that it is only run once. */
private boolean initialParseDone = false;
/** Set to <code>true</code> once all pages were parsed. */
private boolean allPagesParsed = false;
private static final Log LOG = LogFactory.getLog( NonSequentialPDFParser.class );
// ------------------------------------------------------------------------
/**
 * Constructs parser for given file using memory buffer.
 * Delegates to the (File, RandomAccess) constructor with a <code>null</code> buffer.
 *
 * @param filename the filename of the pdf to be parsed
 *
 * @throws IOException If something went wrong.
 */
public NonSequentialPDFParser( String filename ) throws IOException
{
this( new File( filename ), null );
}
/**
 * Constructs parser for given file using given buffer for temporary storage.
 * An empty string is used as decryption password.
 *
 * @param file the pdf to be parsed
 * @param raBuf the buffer to be used for parsing
 *
 * @throws IOException If something went wrong.
 */
public NonSequentialPDFParser( File file, RandomAccess raBuf ) throws IOException
{
this(file, raBuf, "");
}
/**
 * Constructs parser for given file using given buffer for temporary storage.
 *
 * <p>If system property {@link #SYSPROP_EOFLOOKUPRANGE} is set, its value is used
 * as initial EOF lookup range (a non integer value is logged and ignored).</p>
 *
 * @param file the pdf to be parsed
 * @param raBuf the buffer to be used for parsing; if <code>null</code> a new
 *              {@link RandomAccessBuffer} is used
 * @param decryptionPassword password to be used for decryption;
 *              <code>null</code> is treated like an empty password
 *
 * @throws IOException If something went wrong.
 */
public NonSequentialPDFParser( File file, RandomAccess raBuf, String decryptionPassword ) throws IOException
{
    super( EMPTY_INPUT_STREAM, null, false );

    final String eofLookupRangeStr = System.getProperty( SYSPROP_EOFLOOKUPRANGE );
    if ( eofLookupRangeStr != null )
    {
        try
        {
            setEOFLookupRange( Integer.parseInt( eofLookupRangeStr ) );
        }
        catch ( NumberFormatException nfe )
        {
            LOG.warn( "System property " + SYSPROP_EOFLOOKUPRANGE +
                      " does not contain an integer value, but: '" + eofLookupRangeStr + "'" );
        }
    }

    pdfFile = file;
    raStream = new RandomAccessBufferedFileInputStream( pdfFile );

    setDocument( ( raBuf == null ) ? new COSDocument( new RandomAccessBuffer(), false ) :
                                     new COSDocument( raBuf, false ) );

    pdfSource = new PushBackInputStream( raStream, 4096 );

    // the password is dereferenced later (e.g. password.toCharArray() when loading a key store),
    // so never store null
    password = ( decryptionPassword == null ) ? "" : decryptionPassword;
}
// ------------------------------------------------------------------------
/**
 * Defines the number of bytes at the end of the PDF file which are
 * scanned for the EOF marker and the 'startxref' marker.
 * Without calling this method {@link #DEFAULT_TRAIL_BYTECOUNT} is used.
 *
 * <p>Values below 16 are ignored. In practice the range should not be
 * smaller than 1000; even 2000 proved to be too small for files having
 * trailing garbage (like HTML snippets) after the EOF marker.</p>
 *
 * <p>If system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined
 * the value is set on initialization but may be overwritten later.</p>
 *
 * @param byteCount number of trailing bytes
 */
public void setEOFLookupRange( int byteCount )
{
    if ( byteCount < 16 )
    {
        // too small to be useful; keep current setting
        return;
    }
    readTrailBytes = byteCount;
}
// ------------------------------------------------------------------------
/**
 * The initial parse will first parse only the trailer, the xrefstart and
 * all xref tables to have a pointer (offset) to all the pdf's objects.
 * It can handle linearized pdfs, which will have an xref at the
 * end pointing to an xref at the beginning of the file.
 * If the trailer contains an ENCRYPT entry the security handler is prepared
 * for decryption. Last the root object is parsed and, unless minimal catalog
 * parsing is enabled, all objects reachable from the catalog.
 *
 * @throws IOException if the xref chain, trailer or root object is missing or broken
 *                     or if the security handler could not be set up
 */
private void initialParse() throws IOException
{
    final long startxrefOff = getStartxrefOffset();

    // ---- parse startxref
    setPdfSource( startxrefOff );
    parseStartXref();

    final long xrefOffset = document.getStartXref();
    long prev = xrefOffset;

    // ---- parse whole chain of xref tables/object streams using PREV reference;
    //      remember visited offsets so that a cyclic chain cannot loop forever
    final Set<Long> visitedXrefOffsets = new HashSet<Long>();
    while ( prev > -1 )
    {
        if ( ! visitedXrefOffsets.add( prev ) )
        {
            throw new IOException( "/Prev loop at offset " + prev );
        }

        // seek to xref table
        setPdfSource( prev );

        // -- parse xref
        if ( pdfSource.peek() == 'x' )
        {
            // xref table and trailer
            // use existing parser to parse xref table
            parseXrefTable( prev );

            // parse the last trailer.
            if ( ! parseTrailer() )
            {
                throw new IOException( "Expected trailer object at position: " + pdfSource.getOffset() );
            }
            COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
            // read as long (like for xref streams) to not truncate large offsets
            prev = trailer.getLong( COSName.PREV );
        }
        else
        {
            // xref stream
            prev = parseXrefObjStream( prev );
        }
    }

    // ---- build valid xrefs out of the xref chain
    xrefTrailerResolver.setStartxref( xrefOffset );
    document.setTrailer( xrefTrailerResolver.getTrailer() );

    // ---- prepare encryption if necessary
    COSBase trailerEncryptItem = document.getTrailer().getItem( COSName.ENCRYPT );
    if ( trailerEncryptItem != null )
    {
        if ( trailerEncryptItem instanceof COSObject )
        {
            COSObject trailerEncryptObj = (COSObject) trailerEncryptItem;
            parseObjectDynamically( trailerEncryptObj, true );
        }

        try
        {
            PDEncryptionDictionary encParameters = new PDEncryptionDictionary( document.getEncryptionDictionary() );

            DecryptionMaterial decryptionMaterial = null;
            if ( keyStoreFilename != null )
            {
                KeyStore ks = KeyStore.getInstance( "PKCS12" );
                FileInputStream ksIn = new FileInputStream( keyStoreFilename );
                try
                {
                    ks.load( ksIn, password.toCharArray() );
                }
                finally
                {
                    try
                    {
                        ksIn.close();
                    }
                    catch ( IOException ioe )
                    {
                        // best effort: key store content was already read or loading failed anyway
                    }
                }
                decryptionMaterial = new PublicKeyDecryptionMaterial( ks, alias, password );
            }
            else
            {
                decryptionMaterial = new StandardDecryptionMaterial( password );
            }

            securityHandler = SecurityHandlersManager.getInstance().getSecurityHandler( encParameters.getFilter() );
            securityHandler.prepareForDecryption( encParameters, document.getDocumentID(), decryptionMaterial );

            AccessPermission permission = securityHandler.getCurrentAccessPermission();
            if ( ! permission.canExtractContent() )
            {
                LOG.warn( "PDF file '" + pdfFile.getPath() + "' does not allow extracting content." );
            }
        }
        catch ( Exception e )
        {
            // keep the original exception as cause (initCause is available before Java 1.6)
            IOException ioe = new IOException( "Error (" + e.getClass().getSimpleName() +
                                               ") while creating security handler for decryption: " +
                                               e.getMessage() );
            ioe.initCause( e );
            throw ioe;
        }
    }

    // ---- parse catalog or root object
    COSBase rootItem = xrefTrailerResolver.getTrailer().getItem( COSName.ROOT );
    if ( rootItem == null )
    {
        throw new IOException( "Missing root object specification in trailer." );
    }
    if ( ! ( rootItem instanceof COSObject ) )
    {
        throw new IOException( "Root entry in trailer is not an object reference, but: " +
                               rootItem.getClass().getSimpleName() );
    }
    COSObject root = (COSObject) rootItem;
    parseObjectDynamically( root, false );

    // ---- resolve all objects (including pages)
    if ( ! parseMinimalCatalog )
    {
        COSObject catalogObj = document.getCatalog();
        if ( catalogObj != null )
        {
            if ( catalogObj.getObject() instanceof COSDictionary )
            {
                parseDictObjects( (COSDictionary) catalogObj.getObject(), (COSName[]) null );
                allPagesParsed = true;
                document.setDecrypted();
            }
        }
    }

    initialParseDone = true;
}
// ------------------------------------------------------------------------
/**
 * Reads the xref object stream located at the current source position.
 * The source must be positioned at the head of the indirect object
 * (object number, generation number, 'obj').
 *
 * @param objByteOffset file offset of the xref stream object
 *
 * @return value of PREV item in dictionary or <code>-1</code> if no such item exists
 *
 * @throws IOException if the object head, dictionary or stream could not be read
 */
private long parseXrefObjStream( long objByteOffset ) throws IOException
{
    // ---- indirect object head: object number, generation number, 'obj'
    //      (the numbers themselves are not needed here)
    readInt();
    readInt();
    readPattern( OBJ_MARKER );

    // ---- dictionary followed by the stream content
    final COSDictionary streamDict = parseCOSDictionary();
    final COSStream stream = parseCOSStream( streamDict, getDocument().getScratchFile() );
    parseXrefStream( stream, (int) objByteOffset );

    final long prevOffset = streamDict.getLong( COSName.PREV );
    return prevOffset;
}
// ------------------------------------------------------------------------
/**
 * Reports the position of {@link #pdfSource}.
 *
 * @return offset in file at which next byte would be read
 */
private final long getPdfSourceOffset()
{
    final long currentOffset = pdfSource.getOffset();
    return currentOffset;
}
/**
 * Sets {@link #pdfSource} to start next parsing at given file offset.
 *
 * @param fileOffset offset in file to continue reading at
 *
 * @throws IOException propagated from seeking within the source
 */
private final void setPdfSource( long fileOffset ) throws IOException
{
pdfSource.seek( fileOffset );
// alternative using 'old fashioned' input stream
// (kept for reference only; would reopen the file and skip to the offset)
// if ( pdfSource != null )
// pdfSource.close();
//
// pdfSource = new PushBackInputStream(
// new BufferedInputStream(
// new FileInputStream( file ), 16384), 4096);
// pdfSource.skip( _fileOffset );
}
/**
 * Enable handling of alternative pdfSource implementation.
 * Currently a no-op: the only statements are commented out. They belong to
 * the commented-out stream based alternative in {@link #setPdfSource(long)}.
 *
 * @throws IOException never thrown by the current (empty) implementation
 */
private final void releasePdfSourceInputStream() throws IOException
{
// if ( pdfSource != null )
// pdfSource.close();
}
/**
 * Closes {@link #pdfSource} if one is set.
 *
 * @throws IOException if closing the source fails
 */
private final void closeFileStream() throws IOException
{
    if ( pdfSource == null )
    {
        // nothing to close
        return;
    }
    pdfSource.close();
}
// ------------------------------------------------------------------------
/**
 * Looks for the <code>startxref</code> marker. We first look for last '%%EOF' marker
 * (within last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via
 * {@link #setEOFLookupRange(int)}) and go back to find <code>startxref</code>.
 *
 * @return file offset of the last <code>startxref</code> marker preceding the last EOF marker
 *
 * @throws IOException if the trailing bytes could not be read or one of the markers is missing
 */
private final long getStartxrefOffset() throws IOException
{
    byte[] buf;
    long skipBytes;

    // ---- read trailing bytes into buffer
    final long fileLen = pdfFile.length();

    FileInputStream fIn = null;
    try
    {
        fIn = new FileInputStream( pdfFile );

        final int trailByteCount = ( fileLen < readTrailBytes ) ? (int) fileLen : readTrailBytes;
        buf = new byte[ trailByteCount ];
        skipBytes = fileLen - trailByteCount;

        // skip() is allowed to skip fewer bytes than requested; since the returned
        // offset is computed from skipBytes we must make sure everything was skipped
        long toSkip = skipBytes;
        while ( toSkip > 0 )
        {
            final long skipped = fIn.skip( toSkip );
            if ( skipped <= 0 )
            {
                throw new IOException( "Unable to skip to trailing bytes of file, still to skip: " + toSkip );
            }
            toSkip -= skipped;
        }

        int off = 0;
        int readBytes;
        while ( off < trailByteCount )
        {
            readBytes = fIn.read( buf, off, trailByteCount - off );
            // in order to not get stuck in a loop we check readBytes (this should never happen)
            if ( readBytes < 1 )
            {
                throw new IOException( "No more bytes to read for trailing buffer, but expected: " +
                                       ( trailByteCount - off ) );
            }
            off += readBytes;
        }
    }
    finally
    {
        if ( fIn != null )
        {
            try
            {
                fIn.close();
            }
            catch ( IOException ioe )
            {
                // best effort: stream was only used for reading
            }
        }
    }

    // ---- find last '%%EOF'
    int bufOff = lastIndexOf( EOF_MARKER, buf, buf.length );

    if ( bufOff < 0 )
    {
        throw new IOException( "Missing end of file marker '" + ( new String( EOF_MARKER ) ) + "'" );
    }

    // ---- find last startxref preceding EOF marker
    bufOff = lastIndexOf( STARTXREF_MARKER, buf, bufOff );

    if ( bufOff < 0 )
    {
        throw new IOException( "Missing 'startxref' marker." );
    }

    return skipBytes + bufOff;
}
// ------------------------------------------------------------------------
/**
 * Searches last appearance of pattern within buffer. Only occurrences lying
 * completely before <code>endOff</code> are considered; lookup goes back until offset 0.
 *
 * <p>Every candidate position is compared against the whole pattern, thus an
 * occurrence is also found if it is directly followed by bytes forming a
 * partial match (e.g. <code>%%EOF</code> within <code>%%EOFF</code>).</p>
 *
 * @param pattern pattern to search for
 * @param buf buffer to search pattern in
 * @param endOff offset (exclusive) where lookup starts at
 *
 * @return start offset of pattern within buffer or <code>-1</code> if pattern could not be found
 */
private final int lastIndexOf( final char[] pattern, final byte[] buf, final int endOff )
{
    final int patLen = pattern.length;

    // last possible start is the one where pattern ends right before endOff
    for ( int start = endOff - patLen; start >= 0; start-- )
    {
        int patOff = 0;
        while ( ( patOff < patLen ) && ( buf[ start + patOff ] == pattern[ patOff ] ) )
        {
            patOff++;
        }
        if ( patOff == patLen )
        {
            // whole pattern matched
            return start;
        }
    }
    return -1;
}
// ------------------------------------------------------------------------
/**
 * Reads given pattern from {@link #pdfSource}. Skipping whitespace at start and end.
 *
 * @param pattern characters which must follow in given order
 *
 * @throws IOException if pattern could not be read
 */
private final void readPattern( final char[] pattern ) throws IOException
{
    skipSpaces();

    for ( char c : pattern )
    {
        if ( pdfSource.read() != c )
        {
            // message fixed: the quote opened before the pattern is closed now
            throw new IOException( "Expected pattern '" + new String( pattern ) +
                                   "' but missed at character '" + c + "'" );
        }
    }

    skipSpaces();
}
// ------------------------------------------------------------------------
/** Cache for the PAGES dictionary; filled by first successful call of {@link #getPagesObject()}. */
private COSDictionary pagesDictionary = null;
/**
 * Provides the PAGES {@link COSDictionary} referenced by the document catalog.
 * The dictionary is resolved once and cached afterwards.
 *
 * @return the PAGES dictionary
 *
 * @throws IOException if catalog has no PAGES entry or the entry is not a dictionary
 */
private COSDictionary getPagesObject() throws IOException
{
    if ( pagesDictionary == null )
    {
        final COSObject pagesRef = (COSObject) document.getCatalog().getItem( COSName.PAGES );
        if ( pagesRef == null )
        {
            throw new IOException( "Missing PAGES entry in document catalog." );
        }

        final COSBase resolved = parseObjectDynamically( pagesRef, false );
        if ( resolved instanceof COSDictionary )
        {
            pagesDictionary = (COSDictionary) resolved;
        }
        else
        {
            throw new IOException( "PAGES not a dictionary object, but: " +
                                   resolved.getClass().getSimpleName() );
        }
    }
    return pagesDictionary;
}
// ------------------------------------------------------------------------
/**
 * Parses all objects needed by pages and closes input stream.
 * If parsing fails the document is closed as well; failures while
 * closing are ignored.
 *
 * {@inheritDoc}
 */
@Override
public void parse() throws IOException
{
    boolean completed = false;   // only set if everything was processed
    try
    {
        if ( ! initialParseDone )
        {
            initialParse();
        }

        final int pageCount = getPageNumber();

        if ( ! allPagesParsed )
        {
            int pNr = 0;
            while ( pNr < pageCount )
            {
                getPage( pNr );
                pNr++;
            }
            allPagesParsed = true;
            document.setDecrypted();
        }

        completed = true;
    }
    finally
    {
        try
        {
            closeFileStream();
        }
        catch ( IOException ioe )
        {}

        if ( ( ! completed ) && ( document != null ) )
        {
            try
            {
                document.close();
            }
            catch ( IOException ioe )
            {}
        }
    }
}
// ------------------------------------------------------------------------
/**
 * Returns security handler of the document or <code>null</code> if document
 * is not encrypted or {@link #parse()} wasn't called before.
 * The handler is created during the initial parse when the trailer
 * contains an ENCRYPT entry.
 *
 * @return the security handler.
 */
public SecurityHandler getSecurityHandler()
{
return securityHandler;
}
// ------------------------------------------------------------------------
/**
 * Provides the parsed document at the PD layer. Callers have to call
 * close() on the returned document once they are done in order to
 * release resources.
 *
 * The super method is overridden so that the security handler (if any)
 * is handed over to the document.
 *
 * @return The document at the PD layer.
 *
 * @throws IOException If there is an error getting the document.
 */
@Override
public PDDocument getPDDocument() throws IOException
{
    final PDDocument doc = super.getPDDocument();
    if ( securityHandler == null )
    {
        return doc;
    }
    doc.setSecurityHandler( securityHandler );
    return doc;
}
// ------------------------------------------------------------------------
/**
 * Reads the page count from the COUNT entry of the PAGES dictionary.
 *
 * @return the number of pages.
 *
 * @throws IOException if PAGES or other needed object is missing
 *                     or the count is negative
 */
public int getPageNumber() throws IOException
{
    final int count = getPagesObject().getInt( COSName.COUNT );
    if ( count >= 0 )
    {
        return count;
    }
    throw new IOException( "No page number specified." );
}
// ------------------------------------------------------------------------
/**
 * Returns the page requested with all the objects loaded into it.
 *
 * <p>If minimal catalog parsing is active and not all pages were parsed so far,
 * the objects referenced by the page's own RESOURCES dictionary are parsed here.
 * A page without own RESOURCES entry is returned without further parsing.</p>
 *
 * @param pageNr number of requested page; numbering starts with 0
 * @return the page with the given pagenumber.
 * @throws IOException if 'Kids' entry is missing, page was not found or parsing failed
 */
public PDPage getPage( int pageNr ) throws IOException
{
    getPagesObject();

    // ---- get list of top level pages
    COSArray kids = (COSArray) pagesDictionary.getDictionaryObject( COSName.KIDS );

    if ( kids == null )
    {
        throw new IOException( "Missing 'Kids' entry in pages dictionary." );
    }

    // ---- get page we are looking for (possibly going recursively into subpages)
    COSObject pageObj = getPageObject( pageNr, kids, 0 );

    if ( pageObj == null )
    {
        throw new IOException( "Page " + pageNr + " not found." );
    }

    // ---- parse all objects necessary to load page.
    COSDictionary pageDict = (COSDictionary) pageObj.getObject();

    if ( parseMinimalCatalog && ( ! allPagesParsed ) )
    {
        // parse page resources since we did not do this on start
        COSDictionary resDict = (COSDictionary) pageDict.getDictionaryObject( COSName.RESOURCES );
        // a page is not required to carry its own resources dictionary;
        // parseDictObjects would fail with a NullPointerException for null
        if ( resDict != null )
        {
            parseDictObjects( resDict );
        }
    }

    return new PDPage( pageDict );
}
/**
 * Looks up the object of a specific page within the page tree.
 * The tree consists of 'Kids' arrays holding object references. Referenced
 * nodes are parsed dynamically if not done so far. Subtrees which (according
 * to their COUNT entry) end before the requested page are skipped without
 * descending into them, thus only the needed nodes are parsed.
 *
 * @param num the requested page number; numbering starts with 0
 * @param startKids Kids array to start with looking up page number
 * @param startPageCount number of pages preceding the first entry of <code>startKids</code>
 *
 * @return page object or <code>null</code> if no such page exists
 *
 * @throws IOException if a node could not be parsed
 */
private COSObject getPageObject( int num, COSArray startKids, int startPageCount ) throws IOException
{
    int pagesSeen = startPageCount;

    for ( Iterator<COSBase> kidsIter = startKids.iterator(); kidsIter.hasNext(); )
    {
        final COSObject kidRef = (COSObject) kidsIter.next();

        // resolve node if it was not parsed before
        COSBase kidBase = kidRef.getObject();
        if ( kidBase == null )
        {
            kidBase = parseObjectDynamically( kidRef, false );
            kidRef.setObject( kidBase );
        }
        final COSDictionary kidDict = (COSDictionary) kidBase;

        // whole branch lies before requested page -> skip it
        final int subtreeCount = kidDict.getInt( COSName.COUNT );
        if ( ( subtreeCount >= 0 ) && ( ( pagesSeen + subtreeCount ) <= num ) )
        {
            pagesSeen += subtreeCount;
            continue;
        }

        final COSArray subKids = (COSArray) kidDict.getDictionaryObject( COSName.KIDS );
        if ( subKids == null )
        {
            // leaf node, i.e. a page
            if ( pagesSeen == num )
            {
                return kidRef;
            }
            // not the page we are looking for
            pagesSeen++;
        }
        else
        {
            // intermediate node: scan subpages recursively
            final COSObject found = getPageObject( num, subKids, pagesSeen );
            if ( found != null )
            {
                return found;
            }
        }
    }
    return null;
}
/**
 * Builds a unique id for an object: object number in the upper 32 bits
 * combined with the generation number (requires object number &lt; 2^31).
 *
 * @param obj object to build the id for
 *
 * @return id combining object number and generation number
 */
private final long getObjectId( final COSObject obj )
{
    final long objNr = obj.getObjectNumber().longValue();
    final long genNr = obj.getGenerationNumber().longValue();
    return ( objNr << 32 ) | genNr;
}
/**
 * Queues every element of <code>newObjects</code> for parsing, in iteration order.
 * Elements being a COSObject are only queued if they were not queued
 * before (tracked via <code>addedObjects</code>); all other elements are always queued.
 *
 * @param toBeParsedList queue to add the elements to
 * @param newObjects elements to be queued
 * @param addedObjects ids of COSObjects already queued; updated by this method
 */
private final void addNewToList( final Queue<COSBase> toBeParsedList,
                                 final Collection<COSBase> newObjects,
                                 final Set<Long> addedObjects )
{
    final Iterator<COSBase> newIter = newObjects.iterator();
    while ( newIter.hasNext() )
    {
        final COSBase candidate = newIter.next();
        // single element variant does the duplicate check for COSObjects
        addNewToList( toBeParsedList, candidate, addedObjects );
    }
}
/**
 * Queues <code>newObject</code> for parsing. A COSObject is only queued if it
 * was not queued before (tracked via <code>addedObjects</code>); any other
 * kind of object is always queued.
 *
 * @param toBeParsedList queue to add the object to
 * @param newObject object to be queued
 * @param addedObjects ids of COSObjects already queued; updated by this method
 */
private final void addNewToList( final Queue<COSBase> toBeParsedList,
                                 final COSBase newObject,
                                 final Set<Long> addedObjects )
{
    // Set#add returns false if the id is already contained
    final boolean alreadyQueued = ( newObject instanceof COSObject ) &&
                                  ( ! addedObjects.add( getObjectId( (COSObject) newObject ) ) );
    if ( ! alreadyQueued )
    {
        toBeParsedList.add( newObject );
    }
}
/**
 * Parses all objects reachable from the values of the given dictionary
 * (e.g. everything necessary to load a single page from the pdf document).
 * We try our best to order objects according to offset in file before reading
 * to minimize seek operations.
 *
 * <p>The method alternates between two steps until no work is left:
 * first all queued objects are scanned for object references, which are
 * collected sorted by file offset; then the reference(s) with the smallest
 * offset are parsed and the result is queued for scanning again.</p>
 *
 * @param dict dictionary whose values are the starting points
 * @param excludeObjects dictionary object reference entries with these names will not be parsed
 *
 * @throws IOException if an object could not be parsed or the xref entry of an object stream is invalid
 */
private void parseDictObjects( COSDictionary dict, COSName... excludeObjects ) throws IOException
{
// ---- create queue for objects waiting for further parsing
final Queue<COSBase> toBeParsedList = new LinkedList<COSBase>();
// offset ordered object map
final TreeMap<Long,List<COSObject>> objToBeParsed = new TreeMap<Long, List<COSObject>>();
// in case of compressed objects offset points to stmObj
// parsedObjects: ids of objects parsed here (or excluded from parsing);
// addedObjects: ids of object references already queued once
final Set<Long> parsedObjects = new HashSet<Long>();
final Set<Long> addedObjects = new HashSet<Long>();
// ---- add objects not to be parsed to list of already parsed objects
if ( excludeObjects != null )
{
for ( COSName objName : excludeObjects )
{
COSBase baseObj = dict.getItem( objName );
if ( baseObj instanceof COSObject )
{
parsedObjects.add( getObjectId( (COSObject) baseObj ) );
}
}
}
addNewToList( toBeParsedList, dict.getValues(), addedObjects );
// ---- go through objects to be parsed
while( ! ( toBeParsedList.isEmpty() && objToBeParsed.isEmpty() ) )
{
// -- first get all COSObject from other kind of objects and
// put them in objToBeParsed; afterwards toBeParsedList is empty
COSBase baseObj;
while ( ( baseObj = toBeParsedList.poll() ) != null )
{
// containers (stream, dictionary, array) only contribute their elements to the queue
if ( baseObj instanceof COSStream )
{
addNewToList( toBeParsedList, ((COSStream) baseObj).getValues(), addedObjects );
}
else if ( baseObj instanceof COSDictionary )
{
addNewToList( toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects );
}
else if ( baseObj instanceof COSArray )
{
final Iterator<COSBase> arrIter = ( (COSArray) baseObj ).iterator();
while ( arrIter.hasNext() )
{
addNewToList( toBeParsedList, arrIter.next(), addedObjects );
}
}
else if ( baseObj instanceof COSObject )
{
COSObject obj = (COSObject) baseObj;
long objId = getObjectId( obj );
COSObjectKey objKey = new COSObjectKey( obj.getObjectNumber().intValue(),
obj.getGenerationNumber().intValue() );
if ( ! ( parsedObjects.contains( objId ) /*|| document.hasObjectInPool( objKey ) */ ) )
{
Long fileOffset = xrefTrailerResolver.getXrefTable().get( objKey );
// it is allowed that object references point to null, thus we have to test
if ( fileOffset != null )
{
if ( fileOffset > 0 )
{
// NOTE(review): singletonList is immutable and put() replaces an existing entry;
// if the same offset is also used as object stream offset in the branch below,
// add() would fail or queued stream members would get lost - confirm this cannot happen
objToBeParsed.put( fileOffset, Collections.singletonList( obj ) );
}
else
{
// negative offset means we have a compressed object within object stream;
// get offset of object stream
fileOffset = xrefTrailerResolver.getXrefTable().get( new COSObjectKey( -fileOffset, 0 ) );
if ( ( fileOffset == null ) || ( fileOffset <= 0 ) )
{
throw new IOException( "Invalid object stream xref object reference: " + fileOffset );
}
// all compressed objects of one object stream are collected under the stream's offset
List<COSObject> stmObjects = objToBeParsed.get( fileOffset );
if ( stmObjects == null )
{
objToBeParsed.put( fileOffset, stmObjects = new ArrayList<COSObject>() );
}
stmObjects.add( obj );
}
}
else
{
// NULL object
COSObject pdfObject = document.getObjectFromPool( objKey );
pdfObject.setObject( COSNull.NULL );
}
}
}
}
// ---- read first COSObject with smallest offset;
// resulting object will be added to toBeParsedList
if ( objToBeParsed.isEmpty() )
{
break;
}
for ( COSObject obj : objToBeParsed.remove( objToBeParsed.firstKey() ) )
{
COSBase parsedObj = parseObjectDynamically( obj, false );
obj.setObject( parsedObj );
// queue parsed object so that references contained in it are followed in next round
addNewToList( toBeParsedList, parsedObj, addedObjects );
parsedObjects.add( getObjectId( obj ) );
}
}
}
/**
 * Parses the indirect object the given reference points to.
 * Convenience variant which takes object number and generation number
 * from the reference and delegates to
 * {@link #parseObjectDynamically(int, int, boolean)}.
 *
 * @param obj object to be parsed (we only take object number and generation number for lookup start offset)
 * @param requireExistingNotCompressedObj if <code>true</code> object to be parsed must
 *                                        not be contained within compressed stream
 * @return the parsed object (which is also added to document object)
 *
 * @throws IOException If an IO error occurs.
 */
private COSBase parseObjectDynamically( COSObject obj, boolean requireExistingNotCompressedObj )
throws IOException
{
    final int objNr = obj.getObjectNumber().intValue();
    final int objGenNr = obj.getGenerationNumber().intValue();

    return parseObjectDynamically( objNr, objGenNr, requireExistingNotCompressedObj );
}
/**
 * This will parse the object with given object number and generation number
 * and add it to the local state (the object pool of the document).
 * This is taken from {@link PDFParser} and reduced to parsing
 * an indirect object.
 *
 * <p>Depending on the xref entry of the object it is either set to the NULL
 * object (no entry), read from its file offset (positive entry) or taken
 * from the object stream it is contained in (non positive entry holding the
 * negated object number of the object stream). An object already having a
 * value in the pool is not parsed again.</p>
 *
 * @param objNr object number of object to be parsed
 * @param objGenNr object generation number of object to be parsed
 * @param requireExistingNotCompressedObj if <code>true</code> the object to be parsed must be defined
 * in xref (comment: null objects may be missing from xref) and
 * it must not be a compressed object within object stream
 * (this is used to circumvent being stuck in a loop in a malicious PDF)
 *
 * @return the parsed object (which is also added to document object)
 *
 * @throws IOException If an IO error occurs.
 */
private COSBase parseObjectDynamically( int objNr, int objGenNr, boolean requireExistingNotCompressedObj )
throws IOException
{
// ---- create object key and get object (container) from pool
final COSObjectKey objKey = new COSObjectKey( objNr, objGenNr );
final COSObject pdfObject = document.getObjectFromPool( objKey );
if ( pdfObject.getObject() == null )
{
// not previously parsed
// ---- read offset or object stream object number from xref table
Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get( objKey );
// sanity test to circumvent loops with broken documents
if ( requireExistingNotCompressedObj &&
( ( offsetOrObjstmObNr == null ) || ( offsetOrObjstmObNr <= 0 ) ) )
{
throw new IOException( "Object must be defined and must not be compressed object: " +
objKey.getNumber() + ":" + objKey.getGeneration() );
}
if ( offsetOrObjstmObNr == null )
{
// not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
pdfObject.setObject( COSNull.NULL );
}
else if ( offsetOrObjstmObNr > 0 )
{
// offset of indirect object in file
// ---- go to object start
setPdfSource( offsetOrObjstmObNr );
// ---- we must have an indirect object
final int readObjNr = readInt();
final int readObjGen = readInt();
readPattern( OBJ_MARKER );
// ---- consistency check
if ( ( readObjNr != objKey.getNumber() ) ||
( readObjGen != objKey.getGeneration() ) )
{
throw new IOException( "XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() +
" points to wrong object: " + readObjNr + ":" + readObjGen );
}
skipSpaces();
COSBase pb = parseDirObject();
// keyword following the object: either 'stream' or the end of object marker
String endObjectKey = readString();
if ( endObjectKey.equals( "stream" ) )
{
// push keyword back since parseCOSStream reads it itself
pdfSource.unread( endObjectKey.getBytes("ISO-8859-1") );
pdfSource.unread( ' ' );
if( pb instanceof COSDictionary )
{
COSStream stream = parseCOSStream( (COSDictionary)pb,
getDocument().getScratchFile() );
if ( securityHandler != null )
{
try
{
securityHandler.decryptStream(stream, objNr, objGenNr );
}
catch ( CryptographyException ce )
{
throw new IOException( "Error decrypting stream object " + objNr + ": " + ce.getMessage()
/*, ce // TODO: remove remark with Java 1.6 */ );
}
}
pb = stream;
}
else
{
// this is not legal
// the combination of a dict and the stream/endstream forms a complete stream object
throw new IOException( "Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ")." );
}
skipSpaces();
endObjectKey = readLine();
// we have case with a second 'endstream' before endobj
if ( ! endObjectKey.startsWith( "endobj" ) )
{
if ( endObjectKey.startsWith( "endstream" ) )
{
// 9 == length of "endstream"
endObjectKey = endObjectKey.substring( 9 ).trim();
if ( endObjectKey.length() == 0 )
{
// no other characters in extra endstream line
endObjectKey = readLine(); // read next line
}
}
}
}
else if ( securityHandler != null )
{
// decrypt
// only strings directly contained in the object (or in its dictionary/array) are handled here
if ( pb instanceof COSString )
{
decrypt( (COSString) pb, objNr, objGenNr );
}
else if ( pb instanceof COSDictionary )
{
for( Entry<COSName,COSBase> entry : ((COSDictionary) pb).entrySet() )
{
// TODO: specially handle 'Contents' entry of signature dictionary like in SecurityHandler#decryptDictionary
if ( entry.getValue() instanceof COSString )
{
decrypt( (COSString) entry.getValue(), objNr, objGenNr );
}
}
}
else if ( pb instanceof COSArray )
{
final COSArray array = (COSArray) pb;
for( int aIdx = 0, len = array.size(); aIdx < len; aIdx++ )
{
if ( array.get( aIdx ) instanceof COSString )
{
decrypt( (COSString) array.get( aIdx ), objNr, objGenNr );
}
}
}
}
// NOTE: the parsed value is stored before the end marker is verified below
pdfObject.setObject( pb );
if ( ! endObjectKey.startsWith( "endobj" ) )
{
throw new IOException( "Object (" + readObjNr + ":" + readObjGen +
") at offset " + offsetOrObjstmObNr + " does not end with 'endobj'." );
}
releasePdfSourceInputStream();
}
else
{
// xref value is object nr of object stream containing object to be parsed;
// since our object was not found it means object stream was not parsed so far
final int objstmObjNr = (int) ( - offsetOrObjstmObNr );
// 'true': the object stream itself must be a plain (not compressed) object
final COSBase objstmBaseObj = parseObjectDynamically( objstmObjNr, 0, true );
if ( objstmBaseObj instanceof COSStream )
{
// parse object stream
PDFObjectStreamParser parser =
new PDFObjectStreamParser( (COSStream) objstmBaseObj, document, forceParsing );
parser.parse();
// get set of object numbers referenced for this object stream
final Set<Long> refObjNrs = xrefTrailerResolver.getContainedObjectNumbers( objstmObjNr );
// register all objects which are referenced to be contained in object stream
for( COSObject next : parser.getObjects() )
{
COSObjectKey stmObjKey = new COSObjectKey( next );
if ( refObjNrs.contains( stmObjKey.getNumber() ) )
{
COSObject stmObj = document.getObjectFromPool( stmObjKey );
stmObj.setObject( next.getObject() );
}
}
}
}
}
return pdfObject.getObject();
}
// ------------------------------------------------------------------------
/**
 * Decrypts given COSString by handing it to the current security handler.
 *
 * @param str string to decrypt.
 * @param objNr object number forwarded to the security handler.
 * @param objGenNr generation number forwarded to the security handler.
 *
 * @throws IOException if the security handler throws a CryptographyException;
 *                     the original exception is attached as cause.
 */
private final void decrypt( COSString str, long objNr, long objGenNr )
throws IOException
{
    try
    {
        securityHandler.decryptString( str, objNr, objGenNr );
    }
    catch ( CryptographyException ce )
    {
        // IOException(String, Throwable) is only available since Java 1.6;
        // initCause keeps the root cause (and its stack trace) attached
        // without depending on that constructor
        final IOException ioe = new IOException( "Error decrypting string: " + ce.getMessage() );
        ioe.initCause( ce );
        throw ioe;
    }
}
// ------------------------------------------------------------------------
/** Set while {@link #getLength(COSBase)} is running; used to detect re-entrant calls. */
private boolean inGetLength = false;
/**
 * Returns length value referred to or defined in given object.
 *
 * @param lengthBaseObj either a COSNumber holding the length directly or a
 *                      COSObject referencing it; may be <code>null</code>.
 *
 * @return the length, or <code>null</code> if <code>lengthBaseObj</code> is <code>null</code>.
 *
 * @throws IOException if this method is entered again while a length is still being
 *                     resolved, if the referenced object could not be read, or if
 *                     the given/referenced object is not a number.
 */
private COSNumber getLength( final COSBase lengthBaseObj ) throws IOException
{
    if ( lengthBaseObj == null )
    {
        return null;
    }
    if ( inGetLength )
    {
        throw new IOException( "Loop while reading length from " + lengthBaseObj );
    }
    inGetLength = true;
    try
    {
        // ---- direct length value
        if ( lengthBaseObj instanceof COSNumber )
        {
            return (COSNumber) lengthBaseObj;
        }
        // ---- anything but an object reference is not acceptable
        if ( ! ( lengthBaseObj instanceof COSObject ) )
        {
            throw new IOException( "Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName() );
        }
        // ---- length stored in referenced object
        final COSObject lengthRef = (COSObject) lengthBaseObj;
        if ( lengthRef.getObject() == null )
        {
            // referenced object was not parsed yet: remember where we are,
            // parse it and restore the source position afterwards
            final long savedOffset = getPdfSourceOffset();
            releasePdfSourceInputStream();
            parseObjectDynamically( lengthRef, true );
            setPdfSource( savedOffset );
            if ( lengthRef.getObject() == null )
            {
                throw new IOException( "Length object content was not read." );
            }
        }
        final COSBase resolved = lengthRef.getObject();
        if ( resolved instanceof COSNumber )
        {
            return (COSNumber) resolved;
        }
        throw new IOException( "Wrong type of referenced length object " + lengthRef + ": " +
                               resolved.getClass().getSimpleName() );
    }
    finally
    {
        // always clear the guard, also when leaving via exception
        inGetLength = false;
    }
}
// ------------------------------------------------------------------------
/** Size in bytes of the buffer used when copying stream data. */
private final int streamCopyBufLen = 8192;
/** Buffer used by {@link #parseCOSStream(COSDictionary, RandomAccess)} to copy stream data. */
private final byte[] streamCopyBuf = new byte[ streamCopyBufLen ];
/**
 * This will read a COSStream from the input stream using length attribute
 * within dictionary.
 * If length attribute is an indirect reference it is first resolved to get
 * the stream length. This means we copy stream data without testing for
 * 'endstream' or 'endobj' and thus it is no problem if these keywords
 * occur within stream.
 * We require 'endstream' to be found after stream data is read.
 *
 * @param dic dictionary that goes with this stream.
 * @param file file to write the stream to when reading.
 *
 * @return parsed pdf stream.
 *
 * @throws IOException if an error occurred reading the stream, like problems
 *                     with reading length attribute, stream does not end with 'endstream'
 *                     after data read, stream too short etc.
 */
@Override
protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException
{
    final COSStream stream = new COSStream( dic, file );
    OutputStream out = null;
    try
    {
        readString(); // read 'stream'; this was already tested in parseObjectDynamically()
        // ---- skip whitespaces before start of data
        // PDF Ref 1.7, chap. 3.2.7:
        // 'stream' should be followed by either a CRLF (0x0d 0x0a) or LF but nothing else.
        {
            int whitespace = pdfSource.read();
            //see brother_scan_cover.pdf, it adds whitespaces
            //after the stream but before the start of the
            //data, so just read those first
            while ( whitespace == 0x20 )
            {
                whitespace = pdfSource.read();
            }
            if ( whitespace == 0x0D )
            {
                whitespace = pdfSource.read();
                // a lone CR is invalid according to the spec but it happens in
                // the real world so we must support it: give the byte back.
                // Never push back the EOF marker (-1): it would be stored as
                // byte 0xFF and later be read as if it were stream data.
                if ( whitespace != 0x0A && whitespace != -1 )
                {
                    pdfSource.unread( whitespace );
                }
            }
            else if ( whitespace != 0x0A && whitespace != -1 )
            {
                // no whitespace after 'stream'; PDF ref. says 'should' so that is ok
                // (EOF is not pushed back, see above)
                pdfSource.unread( whitespace );
            }
        }
        /*This needs to be dic.getItem because when we are parsing, the underlying object
         * might still be null.
         */
        COSNumber streamLengthObj = getLength( dic.getItem( COSName.LENGTH ) );
        if ( streamLengthObj == null )
        {
            throw new IOException( "Missing length for stream." );
        }
        // ---- get output stream to copy data to
        out = stream.createFilteredStream( streamLengthObj );
        // ---- copy exactly 'length' bytes in chunks of at most streamCopyBufLen
        long remainBytes = streamLengthObj.longValue();
        while ( remainBytes > 0 )
        {
            final int readBytes = pdfSource.read( streamCopyBuf, 0,
                    ( remainBytes > streamCopyBufLen ) ? streamCopyBufLen : (int) remainBytes );
            if ( readBytes <= 0 )
            {
                // source exhausted before the announced length was reached
                throw new IOException( "No more bytes from stream but expected: " + remainBytes );
            }
            out.write( streamCopyBuf, 0, readBytes );
            remainBytes -= readBytes;
        }
        // ---- data must be followed by 'endstream'
        String endStream = readString();
        if ( ! endStream.equals( "endstream" ) )
        {
            throw new IOException( "Error reading stream using length value. Expected='endstream' actual='"
                    + endStream + "' " );
        }
    }
    finally
    {
        if ( out != null )
        {
            out.close();
        }
    }
    return stream;
}
}