// Source Code of org.apache.pdfbox.pdfparser.ConformingPDFParser

/*
 *  Copyright 2010 adam.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  under the License.
 */


package org.apache.pdfbox.pdfparser;


import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.cos.COSUnread;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdmodel.ConformingPDDocument;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.XrefEntry;
import org.apache.pdfbox.persistence.util.COSObjectKey;


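/**
 * A parser which reads a PDF file through random access: the trailer is read
 * backwards starting at the end of the file, then the cross-reference table
 * the trailer points to is read, and finally the objects are read at the byte
 * offsets recorded in that table.  When a structural expectation is not met
 * (missing keyword, unexpected object number, ...) an AssertionError is thrown.
 *
 * @author <a href="adam@apache.org">Adam Nichols</a>
 */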
public class ConformingPDFParser extends BaseParser {
    /** Random access handle on the PDF file; opened read-only by the constructor. */
    protected RandomAccess inputFile;
    /** Cross-reference entries; created and filled by parseXrefTable(), indexed by getObject(). */
    List<XrefEntry> xrefEntries;
    /** Position in inputFile that the next readByte()/readByteBackwards() call will seek to. */
    private long currentOffset;
    /** PD-level document wrapping the COSDocument; created by parse(), null before that. */
    private ConformingPDDocument doc = null;
    /**
     * When true, a mismatch between the expected and the actual file content
     * raises an AssertionError.  Nothing in this class sets it to false.
     */
    private boolean throwNonConformingException = true;
    /**
     * When true, readObject() follows an indirect reference ("n g R") right away
     * and returns the referenced object; when false a COSObject placeholder
     * wrapping a COSUnread is returned instead.
     */
    private boolean recursivlyRead = true;


    /**
     * Constructor.  Opens the given file for read-only random access; nothing
     * is parsed until {@link #parse()} is called.
     *
     * @param inputFile The PDF file to parse.
     *
     * @throws IOException If there is an error opening the file.
     */
    public ConformingPDFParser(File inputFile) throws IOException {
        this.inputFile = new RandomAccessFile(inputFile, "r");
    }


    /**
     * This will parse the stream and populate the COSDocument object.  This will close
     * the stream when it is done parsing.
     *
     * @throws IOException If there is an error reading from the stream or corrupt data
     * is found.
     */
    public void parse() throws IOException {
        document = new COSDocument();
        doc = new ConformingPDDocument(document);
        currentOffset = inputFile.length()-1;
        long xRefTableLocation = parseTrailerInformation();
        currentOffset = xRefTableLocation;
        parseXrefTable();
        // now that we read the xref table and put null references in the doc,
        // we can deference those objects now.
        boolean oldValue = recursivlyRead;
        recursivlyRead = false;
        List<COSObjectKey> keys = doc.getObjectKeysFromPool();
        for(COSObjectKey key : keys) {
            // getObject will put it into the document's object pool for us
            getObject(key.getNumber(), key.getGeneration());
        }
        recursivlyRead = oldValue;
    }


    /**
     * This will get the document that was parsed.  parse() must be called before this is called.
     * When you are done with this document you must call close() on it to release
     * resources.
     *
     * @return The document that was parsed.
     *
     * @throws IOException If there is an error getting the document.
     */
    public COSDocument getDocument() throws IOException {
        if( document == null ) {
            throw new IOException( "You must call parse() before calling getDocument()" );
        }
        return document;
    }


    /**
     * This will get the PD document that was parsed.  When you are done with
     * this document you must call close() on it to release resources.
     *
     * @return The document at the PD layer.
     *
     * @throws IOException If there is an error getting the document.
     */
    public PDDocument getPDDocument() throws IOException {
        return doc;
    }
    
    private boolean parseXrefTable() throws IOException {
        String currentLine = readLine();
        if(throwNonConformingException) {
            if(!"xref".equals(currentLine))
                throw new AssertionError("xref table not found.\nExpected: xref\nFound: "+currentLine);
        }


        int objectNumber = readInt();
        int entries = readInt();
        xrefEntries = new ArrayList<XrefEntry>(entries);
        for(int i=0; i<entries; i++)
            xrefEntries.add(new XrefEntry(objectNumber++, readInt(), readInt(), readLine()));
        
        return true;
    }


    protected long parseTrailerInformation() throws IOException, NumberFormatException {
        long xrefLocation = -1;
        consumeWhitespaceBackwards();
        String currentLine = readLineBackwards();
        if(throwNonConformingException) {
            if(!"%%EOF".equals(currentLine))
                throw new AssertionError("Invalid EOF marker.\nExpected: %%EOF\nFound: "+currentLine);
        }


        xrefLocation = readLongBackwards();
        currentLine = readLineBackwards();
        if(throwNonConformingException) {
            if(!"startxref".equals(currentLine))
                throw new AssertionError("Invalid trailer.\nExpected: startxref\nFound: "+currentLine);
        }


        document.setTrailer(readDictionaryBackwards());
        consumeWhitespaceBackwards();
        currentLine = readLineBackwards();
        if(throwNonConformingException) {
            if(!"trailer".equals(currentLine))
                throw new AssertionError("Invalid trailer.\nExpected: trailer\nFound: "+currentLine);
        }


        return xrefLocation;
    }
    
    protected byte readByteBackwards() throws IOException {
        inputFile.seek(currentOffset);
        byte singleByte = (byte)inputFile.read();
        currentOffset--;
        return singleByte;
    }


    protected byte readByte() throws IOException {
        inputFile.seek(currentOffset);
        byte singleByte = (byte)inputFile.read();
        currentOffset++;
        return singleByte;
    }


    protected String readBackwardUntilWhitespace() throws IOException {
        StringBuilder sb = new StringBuilder();
        byte singleByte = readByteBackwards();
        while(!isWhitespace(singleByte)) {
            sb.insert(0, (char)singleByte);
            singleByte = readByteBackwards();
        }
        return sb.toString();
    }


    /**
     * This will read all bytes (backwards) until a non-whitespace character is
     * found.  To save you an extra read, the non-whitespace character is
     * returned.  If the current character is not whitespace, this method will
     * just return the current char.
     * @return the first non-whitespace character found
     * @throws IOException if there is an error reading from the file
     */
    protected byte consumeWhitespaceBackwards() throws IOException {
        inputFile.seek(currentOffset);
        byte singleByte = (byte)inputFile.read();
        if(!isWhitespace(singleByte))
            return singleByte;


        // we have some whitespace, let's consume it
        while(isWhitespace(singleByte)) {
            singleByte = readByteBackwards();
        }
        // readByteBackwards will decrement the currentOffset to point the byte
        // before the one just read, so we increment it back to the current byte
        currentOffset++;
        return singleByte;
    }


    /**
     * This will read all bytes until a non-whitespace character is
     * found.  To save you an extra read, the non-whitespace character is
     * returned.  If the current character is not whitespace, this method will
     * just return the current char.
     * @return the first non-whitespace character found
     * @throws IOException if there is an error reading from the file
     */
    protected byte consumeWhitespace() throws IOException {
        inputFile.seek(currentOffset);
        byte singleByte = (byte)inputFile.read();
        if(!isWhitespace(singleByte))
            return singleByte;


        // we have some whitespace, let's consume it
        while(isWhitespace(singleByte)) {
            singleByte = readByte();
        }
        // readByte() will increment the currentOffset to point the byte
        // after the one just read, so we decrement it back to the current byte
        currentOffset--;
        return singleByte;
    }


    /**
     * This will consume any whitespace, read in bytes until whitespace is found
     * again and then parse the characters which have been read as a long.  The
     * current offset will then point at the first whitespace character which
     * preceeds the number.
     * @return the parsed number
     * @throws IOException if there is an error reading from the file
     * @throws NumberFormatException if the bytes read can not be converted to a number
     */
    protected long readLongBackwards() throws IOException, NumberFormatException {
        StringBuilder sb = new StringBuilder();
        consumeWhitespaceBackwards();
        byte singleByte = readByteBackwards();
        while(!isWhitespace(singleByte)) {
            sb.insert(0, (char)singleByte);
            singleByte = readByteBackwards();
        }
        if(sb.length() == 0)
            throw new AssertionError("Number not found.  Expected number at offset: " + currentOffset);
        return Long.parseLong(sb.toString());
    }


    @Override
    protected int readInt() throws IOException {
        StringBuilder sb = new StringBuilder();
        consumeWhitespace();
        byte singleByte = readByte();
        while(!isWhitespace(singleByte)) {
            sb.append((char)singleByte);
            singleByte = readByte();
        }
        if(sb.length() == 0)
            throw new AssertionError("Number not found.  Expected number at offset: " + currentOffset);
        return Integer.parseInt(sb.toString());
    }


    /**
     * This will read in a number and return the COS version of the number (be
     * it a COSInteger or a COSFloat).
     * @return the COSNumber which was read/parsed
     * @throws IOException
     */
    protected COSNumber readNumber() throws IOException {
        StringBuilder sb = new StringBuilder();
        consumeWhitespace();
        byte singleByte = readByte();
        while(!isWhitespace(singleByte)) {
            sb.append((char)singleByte);
            singleByte = readByte();
        }
        if(sb.length() == 0)
            throw new AssertionError("Number not found.  Expected number at offset: " + currentOffset);
        return parseNumber(sb.toString());
    }


    protected COSNumber parseNumber(String number) throws IOException {
        if(number.matches("^[0-9]+$"))
            return COSInteger.get(number);
        return new COSFloat(Float.parseFloat(number));
    }


    protected COSBase processCosObject(String string) throws IOException {
        if(string != null && string.endsWith(">")) {
            // string of hex codes
            return COSString.createFromHexString(string.replaceAll("^<", "").replaceAll(">$", ""));
        }
        return null;
    }


    protected COSBase readObjectBackwards() throws IOException {
        COSBase obj = null;
        consumeWhitespaceBackwards();
        String lastSection = readBackwardUntilWhitespace();
        if("R".equals(lastSection)) {
            // indirect reference
            long gen = readLongBackwards();
            long number = readLongBackwards();
            // We just put a placeholder in the pool for now, we'll read the data later
            doc.putObjectInPool(new COSUnread(), number, gen);
            obj = new COSUnread(number, gen, this);
        } else if(">>".equals(lastSection)) {
            // dictionary
            throw new RuntimeException("Not yet implemented");
        } else if(lastSection != null && lastSection.endsWith("]")) {
            // array
            COSArray array = new COSArray();
            lastSection = lastSection.replaceAll("]$", "");
            while(!lastSection.startsWith("[")) {
                if(lastSection.matches("^\\s*<.*>\\s*$")) // it's a hex string
                    array.add(COSString.createFromHexString(lastSection.replaceAll("^\\s*<", "").replaceAll(">\\s*$", "")));
                lastSection = readBackwardUntilWhitespace();
            }
            lastSection = lastSection.replaceAll("^\\[", "");
            if(lastSection.matches("^\\s*<.*>\\s*$")) // it's a hex string
                array.add(COSString.createFromHexString(lastSection.replaceAll("^\\s*<", "").replaceAll(">\\s*$", "")));
            obj = array;
        } else if(lastSection != null && lastSection.endsWith(">")) {
            // string of hex codes
            obj = processCosObject(lastSection);
        } else {
            // try a number, otherwise fall back on a string
            try {
                Long.parseLong(lastSection);
                obj = COSNumber.get(lastSection);
            } catch(NumberFormatException e) {
                throw new RuntimeException("Not yet implemented");
            }
        }


        return obj;
    }


    protected COSName readNameBackwards() throws IOException {
        String name = readBackwardUntilWhitespace();
        name = name.replaceAll("^/", "");
        return COSName.getPDFName(name);
    }


    public COSBase getObject(long objectNumber, long generation) throws IOException {
        // we could optionally, check to see if parse() have been called &
        // throw an exception here, but I don't think that's really necessary
        XrefEntry entry = xrefEntries.get((int)objectNumber);
        currentOffset = entry.getByteOffset();
        return readObject(objectNumber, generation);
    }


    /**
     * This will read an object from the inputFile at whatever our currentOffset
     * is.  If the object and generation are not the expected values and this
     * object is set to throw an exception for non-conforming documents, then an
     * exception will be thrown.
     * @param objectNumber the object number you expect to read
     * @param generation the generation you expect this object to be
     * @return
     */
    public COSBase readObject(long objectNumber, long generation) throws IOException {
        // when recursivly reading, we always pull the object from the filesystem
        if(document != null && recursivlyRead) {
            // check to see if it is in the document cache before hitting the filesystem
            COSBase obj = doc.getObjectFromPool(objectNumber, generation);
            if(obj != null)
                return obj;
        }


        int actualObjectNumber = readInt();
        if(objectNumber != actualObjectNumber)
            if(throwNonConformingException)
                throw new AssertionError("Object numer expected was " +
                        objectNumber + " but actual was " + actualObjectNumber);
        consumeWhitespace();


        int actualGeneration = readInt();
        if(generation != actualGeneration)
            if(throwNonConformingException)
                throw new AssertionError("Generation expected was " +
                        generation + " but actual was " + actualGeneration);
        consumeWhitespace();


        String obj = readWord();
        if(!"obj".equals(obj))
            if(throwNonConformingException)
                throw new AssertionError("Expected keyword 'obj' but found " + obj);
        
        // put placeholder object in doc to prevent infinite recursion
        // e.g. read Root -> dereference object -> read object which has /Parent -> GOTO read Root
        doc.putObjectInPool(new COSObject(null), objectNumber, generation);
        COSBase object = readObject();
        doc.putObjectInPool(object, objectNumber, generation);
        return object;
    }


    /**
     * This actually reads the object data.
     * @return the object which is read
     * @throws IOException
     */
    protected COSBase readObject() throws IOException {
        consumeWhitespace();
        String string = readWord();
        if(string.startsWith("<<")) {
            // this is a dictionary
            COSDictionary dictionary = new COSDictionary();
            boolean atEndOfDictionary = false;
            // remove the marker for the beginning of the dictionary
            string = string.replaceAll("^<<", "");
            
            if("".equals(string) || string.matches("^\\w$"))
                string = readWord().trim();
            while(!atEndOfDictionary) {
                COSName name = COSName.getPDFName(string);
                COSBase object = readObject();
                dictionary.setItem(name, object);


                byte singleByte = consumeWhitespace();
                if(singleByte == '>') {
                    readByte(); // get rid of the second '>'
                    atEndOfDictionary = true;
                }
                if(!atEndOfDictionary)
                    string = readWord().trim();
            }
            return dictionary;
        } else if(string.startsWith("/")) {
            // it's a dictionary label. i.e. /Type or /Pages or something similar
            COSBase name = COSName.getPDFName(string);
            return name;
        } else if(string.startsWith("-")) {
            // it's a negitive number
            return parseNumber(string);
        } else if(string.charAt(0) >= '0' && string.charAt(0) <= '9' ) {
            // it's a COSInt or COSFloat, or a weak reference (i.e. "3 0 R")
            // we'll have to peek ahead a little to see if it's a reference or not
            long tempOffset = this.currentOffset;
            consumeWhitespace();
            String tempString = readWord();
            if(tempString.matches("^[0-9]+$")) {
                // it is an int, might be a weak reference...
                tempString = readWord();
                if(!"R".equals(tempString)) {
                    // it's just a number, not a weak reference
                    this.currentOffset = tempOffset;
                    return parseNumber(string);
                }
            } else {
                // it's just a number, not a weak reference
                this.currentOffset = tempOffset;
                return parseNumber(string);
            }


            // it wasn't a number, so we need to parse the weak-reference
            this.currentOffset = tempOffset;
            int number = Integer.parseInt(string);
            int gen = readInt();
            String r = readWord();


            if(!"R".equals(r))
                if(throwNonConformingException)
                    throw new AssertionError("Expected keyword 'R' but found " + r);


            if(recursivlyRead) {
                // seek to the object, read it, seek back to current location
                long tempLocation = this.currentOffset;
                this.currentOffset = this.xrefEntries.get(number).getByteOffset();
                COSBase returnValue = readObject(number, gen);
                this.currentOffset = tempLocation;
                return returnValue;
            } else {
                // Put a COSUnknown there as a placeholder
                COSObject obj = new COSObject(new COSUnread());
                obj.setObjectNumber(COSInteger.get(number));
                obj.setGenerationNumber(COSInteger.get(gen));
                return obj;
            }
        } else if(string.startsWith("]")) {
            // end of an array, just return null
            if("]".equals(string))
                return null;
            int oldLength = string.length();
            this.currentOffset -= oldLength;
            return null;
        } else if(string.startsWith("[")) {
            // array of values
            // we'll just pay attention to the first part (this is in case there
            // is no whitespace between the "[" and the first element)
            int oldLength = string.length();
            string = "[";
            this.currentOffset -= (oldLength - string.length() + 1);


            COSArray array = new COSArray();
            COSBase object = readObject();
            while(object != null) {
                array.add(object);
                object = readObject();
            }
            return array;
        } else if(string.startsWith("(")) {
            // this is a string (not hex encoded), strip off the '(' and read until ')'
            StringBuilder sb = new StringBuilder(string.substring(1));
            byte singleByte = readByte();
            while(singleByte != ')') {
                sb.append((char)singleByte);
                singleByte = readByte();
            }
            return new COSString(sb.toString());
        } else {
            throw new RuntimeException("Not yet implemented: " + string
                    + " loation=" + this.currentOffset);
        }
    }


    /**
     * This will read the next string from the stream.
     * @return The string that was read from the stream.
     * @throws IOException If there is an error reading from the stream.
     */
    @Override
    protected String readString() throws IOException {
        consumeWhitespace();
        StringBuilder buffer = new StringBuilder();
        int c = pdfSource.read();
        while(!isEndOfName((char)c) && !isClosing(c) && c != -1) {
            buffer.append( (char)c );
            c = pdfSource.read();
        }
        if (c != -1) {
            pdfSource.unread(c);
        }
        return buffer.toString();
    }


    protected COSDictionary readDictionaryBackwards() throws IOException {
        COSDictionary dict = new COSDictionary();
        
        // consume the last two '>' chars which signify the end of the dictionary
        consumeWhitespaceBackwards();
        byte singleByte = readByteBackwards();
        if(throwNonConformingException) {
            if(singleByte != '>')
                throw new AssertionError("");
        }
        singleByte = readByteBackwards();
        if(throwNonConformingException) {
            if(singleByte != '>')
                throw new AssertionError("");
        }
        
        // check to see if we're at the end of the dictionary
        boolean atEndOfDictionary = false;
        singleByte = consumeWhitespaceBackwards();
        if(singleByte == '<') {
            inputFile.seek(currentOffset-1);
            atEndOfDictionary =  ((byte)inputFile.read()) == '<';
        }


        COSDictionary backwardsDictionary = new COSDictionary();
        // while we're not at the end of the dictionary, read in entries
        while(!atEndOfDictionary) {
            COSBase object = readObjectBackwards();
            COSName name = readNameBackwards();
            backwardsDictionary.setItem(name, object);
            
            singleByte = consumeWhitespaceBackwards();
            if(singleByte == '<') {
                inputFile.seek(currentOffset-1);
                atEndOfDictionary =  ((byte)inputFile.read()) == '<';
            }
        }


        // the dictionaries preserve the order keys were added, as such we shall
        // add them in the proper order, not the reverse order
        Set<COSName> backwardsKeys = backwardsDictionary.keySet();
        for(int i = backwardsKeys.size()-1; i >=0; i--)
            dict.setItem((COSName)backwardsKeys.toArray()[i], backwardsDictionary.getItem((COSName)backwardsKeys.toArray()[i]));
        
        // consume the last two '<' chars
        readByteBackwards();
        readByteBackwards();


        return dict;
    }


    /**
     * This will read a line starting with the byte at offset and going 
     * backwards until it finds a newline.  This should only be used if we are
     * certain that the data will only be text, and not binary data.
     * 
     * @param offset the location of the file where we should start reading
     * @return the string which was read
     * @throws IOException if there was an error reading data from the file
     */
    protected String readLineBackwards() throws IOException {
        StringBuilder sb = new StringBuilder();
        boolean endOfObject = false;
        
        do {
            // first we read the %%EOF marker
            byte singleByte = readByteBackwards();
            if(singleByte == '\n') {
                // if ther's a preceeding \r, we'll eat that as well
                inputFile.seek(currentOffset);
                if((byte)inputFile.read() == '\r')
                    currentOffset--;
                endOfObject = true;
            } else if(singleByte == '\r') {
                endOfObject = true;
            } else {
                sb.insert(0, (char)singleByte);
            }
        } while(!endOfObject);
        
        return sb.toString();
    }


    /**
     * This will read a line starting with the byte at offset and going
     * forward until it finds a newline.  This should only be used if we are
     * certain that the data will only be text, and not binary data.
     * @param offset the location of the file where we should start reading
     * @return the string which was read
     * @throws IOException if there was an error reading data from the file
     */
    @Override
    protected String readLine() throws IOException {
        StringBuilder sb = new StringBuilder();
        boolean endOfLine = false;


        do {
            // first we read the %%EOF marker
            byte singleByte = readByte();
            if(singleByte == '\n') {
                // if ther's a preceeding \r, we'll eat that as well
                inputFile.seek(currentOffset);
                if((byte)inputFile.read() == '\r')
                    currentOffset++;
                endOfLine = true;
            } else if(singleByte == '\r') {
                endOfLine = true;
            } else {
                sb.append((char)singleByte);
            }
        } while(!endOfLine);


        return sb.toString();
    }


    protected String readWord() throws IOException {
        StringBuilder sb = new StringBuilder();
        boolean stop = true;
        do {
            byte singleByte = readByte();
            stop = this.isWhitespace(singleByte);


            // there are some additional characters which indicate the next element/word has begun
            // ignore the first char we read, b/c the first char is the beginnging of this object, not the next one
            if(!stop && sb.length() > 0) {
                stop = singleByte == '/' || singleByte == '['
                        || singleByte == ']'
                        || (singleByte == '>' && !">".equals(sb.toString()));
                if(stop) // we're stopping on a non-whitespace char, decrement the
                    this.currentOffset--; // counter so we don't miss this character
            }
            if(!stop)
                sb.append((char)singleByte);
        } while(!stop);


        return sb.toString();
    }


    /**
     * @return the recursivlyRead
     */
    public boolean isRecursivlyRead() {
        return recursivlyRead;
    }


    /**
     * @param recursivlyRead the recursivlyRead to set
     */
    public void setRecursivlyRead(boolean recursivlyRead) {
        this.recursivlyRead = recursivlyRead;
    }
}
// Source Code of org.apache.pdfbox.pdfparser.ConformingPDFParser

// Related Classes of org.apache.pdfbox.pdfparser.ConformingPDFParser