// Source Code of org.pdfbox.util.PDFTextStripper

/**
 * Copyright (c) 2003, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */
package org.pdfbox.util;


import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringWriter;
import java.io.Writer;


import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;


import org.pdfbox.cos.COSArray;
import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSInteger;
import org.pdfbox.cos.COSName;
import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;


import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.PDResources;


import org.pdfbox.encryption.DecryptDocument;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;


import org.apache.log4j.Category;




/**
 * This class takes a PDF document and extracts all of its text, ignoring the
 * formatting.
 *
 * @author Ben Litchfield (ben@csh.rit.edu)
 * @version $Revision: 1.33 $
 */
public class PDFTextStripper extends PDFStreamEngine
{
    private static Category log = Category.getInstance(PDFTextStripper.class.getName());
    private static final int BUFFER_SIZE = 1024;




    private int currentPage = 0;
    private int startPage = 1;
    private int endPage = Integer.MAX_VALUE;
    private PDDocument document;
    private List charactersList = new ArrayList();
    private Writer output;
    private String lineSeparator = System.getProperty("line.separator");
    private String pageSeparator = System.getProperty("line.separator");


    /**
     * This will return the text of a document.  See writeText. <br />
     * NOTE: The document must not be encrypted when coming into this method.
     *
     * @param doc The document to get the text from.
     *
     * @return The text of the PDF document.
     *
     * @throws IOException if the doc state is invalid or it is encrypted.
     */
    public String getText( PDDocument doc ) throws IOException
    {
        StringWriter outputStream = new StringWriter();
        writeText( doc, outputStream );
        return outputStream.toString();
    }


    /**
     * @deprecated
     * @see getText( PDDocument )
     */
    public String getText( COSDocument doc ) throws IOException
    {
        PDDocument document = new PDDocument( doc );
        return getText( document );
    }


    /**
     * @deprecated
     * @see writeText( PDDocument, Writer )
     */
    public void writeText( COSDocument doc, Writer outputStream ) throws IOException
    {
        PDDocument document = new PDDocument( doc );
        writeText( document, outputStream );
    }


    /**
     * This will take a PDDocument and write the text of that document to the print writer.
     *
     * @param doc The document to get the data from.
     * @param outputStream The location to put the text.
     *
     * @throws IOException If the doc is in an invalid state.
     */
    public void writeText( PDDocument doc, Writer outputStream ) throws IOException
    {
        COSDictionary encrypted = doc.getDocument().getEncryptionDictionary();
        if( encrypted != null )
        {
            COSInteger p = (COSInteger)encrypted.getDictionaryObject( COSName.getPDFName( "P" ) );
            long pVal = p.intValue();
            if( (pVal & 16) == 0 )
            {
                throw new IOException( "You do not have permission to extract text" );
            }
        }
        currentPage = 0;
        document = null;
        document = doc;
        output = outputStream;


        if( document.isEncrypted() )
        {
            // We are expecting non-encrypted documents here, but it is common
            // for users to pass in a document that is encrypted with an empty
            // password (such a document appears to not be encrypted by
            // someone viewing the document, thus the confusion).  We will
            // attempt to decrypt with the empty password to handle this case.
            //
            log.debug("Document is encrypted, decrypting with empty password");
            DecryptDocument decryptor = new DecryptDocument( document );
            try
            {
                decryptor.decryptDocument("");
            }
            catch (CryptographyException e)
            {
                throw new IOException("Error decrypting document, details: " + e.getMessage());
            }
            catch (InvalidPasswordException e)
            {
                throw new IOException("Error: document is encrypted");
            }
        }


        processPages( document.getDocumentCatalog().getAllPages() );
    }


    /**
     * This will process all of the pages and the text that is in them.
     *
     * @param pages The pages object in the document.
     *
     * @throws IOException If there is an error parsing the text.
     */
    protected void processPages( List pages ) throws IOException
    {
        if( log.isDebugEnabled() )
        {
            log.debug( "processPages( " + pages + " )" );
        }


        Iterator pageIter = pages.iterator();
        while( pageIter.hasNext() )
        {
            PDPage page = (PDPage)pageIter.next();


            COSBase contents = page.getCOSDictionary().getDictionaryObject( COSName.CONTENTS );
            if( contents != null )
            {
                //clear the text matrix for every page.
                if( contents instanceof COSStream )
                {
                    processPage( page, (COSStream)contents );
                }
                else if( contents instanceof COSArray )
                {
                    COSArray contentsArray = (COSArray)contents;
                    byte[] buffer = new byte[ BUFFER_SIZE ];
                    int amountRead = 0;
                    if( contentsArray.size() > 0 )
                    {
                        COSObject first = (COSObject)contentsArray.get( 0 );
                        COSStream firstStream = (COSStream)first.getObject();
                        COSStream tmpStream =
                            new COSStream( firstStream.getDictionary(),
                                           firstStream.getScratchFile() );
                        OutputStream output = tmpStream.createUnfilteredStream();
                        for( int i=0; i<contentsArray.size(); i++ )
                        {
                            COSObject obj = (COSObject)contentsArray.get( i );
                            COSStream content = (COSStream)obj.getObject();
                            InputStream input = content.getUnfilteredStream();
                            while ( (amountRead = input.read(buffer, 0, BUFFER_SIZE)) != -1)
                            {
                                output.write(buffer, 0, amountRead);
                            }
                            //handle the case where there is no whitespace in the
                            //between streams in the contents array, without this
                            //it is possible that two operators will get concatenated
                            //together
                            output.write( "\n".getBytes() );
                        }
                        processPage( page, tmpStream );
                    }
                }
                else
                {
                    throw new IOException( "Contents are unknown type:" + contents.getClass().getName() );
                }
            }
        }
        log.debug( "processPages() end" );
    }


    /**
     * This will process the contents of a page.
     *
     * @param page The page to process.
     * @param content The contents of the page.
     *
     * @throws IOException If there is an error processing the page.
     */
    protected void processPage( PDPage page, COSStream content ) throws IOException
    {
        long start = System.currentTimeMillis();
        if( log.isDebugEnabled() )
        {
            log.debug( "processPage( " + page + ", " + content + " )" );
        }
        PDResources resources = page.findResources();
        Map fonts = null;
        if( resources != null )
        {
            fonts = resources.getFonts();
        }
        currentPage++;
        if( currentPage >= startPage && currentPage <= endPage )
        {
            charactersList.clear();
            long startProcess = System.currentTimeMillis();
            processStream( content, fonts );
            long stopProcess = System.currentTimeMillis();
            long startFlush = System.currentTimeMillis();
            flushText();
            long stopFlush = System.currentTimeMillis();
            if( log.isDebugEnabled() )
            {
                log.debug( "processStream time=" + (stopProcess-startProcess) );
                log.debug( "flushText time=" + (stopFlush-startFlush) );
            }
        }
        long stop = System.currentTimeMillis();
        if( log.isDebugEnabled() )
        {
            log.debug( "processPage() end time=" + (stop-start) );
        }




    }


    /**
     * This will print the text to the output stream.
     *
     * @throws IOException If there is an error writing the text.
     */
    protected void flushText() throws IOException
    {
        log.debug( "flushText() start" );
        float currentY = -1;
        float lastBaselineFontSize = -1;
        log.debug("<Starting text object list>");
        float endOfLastTextX = -1;
        float startOfNextWordX = -1;
        Iterator textIter = charactersList.iterator();
        while( textIter.hasNext() )
        {
            TextPosition position = (TextPosition)textIter.next();


            // RDD - We will suppress text that is very close to the current line
            // and which overwrites previously rendered text on this line.
            // This is done specifically to handle a reasonably common situation
            // where an application (MS Word, in the case of my examples) renders
            // text four times at small (1 point) offsets in order to accomplish
            // bold printing.  You would not want to do this step if you were
            // going to render the TextPosition objects graphically.
            //
            if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
                (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
            {
                if (log.isDebugEnabled())
                {
                    log.debug("Suppressing text overwrite" +
                              " x: " + position.getX() +
                              " endOfLastTextX: " + endOfLastTextX +
                              " string: " + position.getString());
                }
                continue;
            }


            // RDD - Here we determine whether this text object is on the current
            // line.  We use the lastBaselineFontSize to handle the superscript
            // case, and the size of the current font to handle the subscript case.
            // Text must overlap with the last rendered baseline text by at least
            // a small amount in order to be considered as being on the same line.
            //
            if (currentY != -1 &&
                ((position.getY() > (currentY + (lastBaselineFontSize * 0.9f))) ||
                 (position.getY() < (currentY - (position.getFontSize() * 0.9f)))))
            {
                if (log.isDebugEnabled())
                {
                    log.debug("<newline currentY=" + currentY + ", y=" + position.getY() + ">");
                }
                output.write(lineSeparator);
                endOfLastTextX = -1;
                startOfNextWordX = -1;
                currentY = -1;
                lastBaselineFontSize = -1;
            }


            if (startOfNextWordX != -1 && startOfNextWordX < position.getX())
            {
                if (log.isDebugEnabled())
                {
                    log.debug("<space startOfNextWordX=" + startOfNextWordX + ", x=" + position.getX() + ">");
                }
                output.write(0x20);
            }


            if (log.isDebugEnabled())
            {
                log.debug("flushText" +
                          " x=" + position.getX() +
                          " y=" + position.getY() +
                          " width=" + position.getWidth() +
                          " currentY=" + currentY +
                          " endOfLastTextX=" + endOfLastTextX +
                          " startOfNextWordX=" + startOfNextWordX +
                          " fontSize=" + position.getFontSize() +
                          " string=\"" + position.getString() + "\"");
            }


            if (currentY == -1)
            {
                currentY = position.getY();
            }


            if (currentY == position.getY())
            {
                lastBaselineFontSize = position.getFontSize();
            }


            // RDD - endX is what PDF considers to be the x coordinate of the
            // end position of the text.  We use it in computing our metrics below.
            //
            float endX = position.getX() + position.getWidth();


            // RDD - Here we compute the value that represents the end of the rendered
            // text.  This value is used to determine whether subsequent text rendered
            // on the same line overwrites the current text.
            //
            // We subtract any positive padding to handle cases where extreme amounts
            // of padding are applied, then backed off (not sure why this is done, but there
            // are cases where the padding is on the order of 10x the character width, and
            // the TJ just backs up to compensate after each character).  Also, we subtract
            // an amount to allow for kerning (a percentage of the width of the last
            // character).
            //
            float textTrailingSpaceWidth = 0;
            for (int n = position.getString().length()-1; n >= 0 && position.getString().charAt(n) == ' '; n--)
            {
                textTrailingSpaceWidth += position.getWordSpacing();
            }
            endOfLastTextX = endX -
                             Math.max(position.getPadding(), 0) -
                             textTrailingSpaceWidth -
                             (0.4f * position.getLastCharWidth());


            // RDD - We add a conservative approximation for space determination.
            //
            startOfNextWordX = endX + (position.getWordSpacing() * 0.30f);


            if (position.getString() != null)
            {
                output.write(position.getString());
            }
        }


        // RDD - newline at end of flush - required for end of page (so that the top
        // of the next page starts on its own line.
        //
        log.debug("<newline endOfFlush=\"true\">");
        output.write(pageSeparator);


        output.flush();
    }


    /**
     * This will determine of two floating point numbers are within a specified variance.
     *
     * @param first The first number to compare to.
     * @param second The second number to compare to.
     * @param variance The allowed variance.
     */
    private boolean within( float first, float second, float variance )
    {
        float firstMin = first - variance;
        float firstMax = first + variance;
        return second > firstMin && second < firstMax;
    }


    /**
     * This will determine of two floating point numbers are equal within a small amount of variance.
     *
     * @param first The first number to compare to.
     * @param second The second number to compare to.
     */
    private boolean equals( float first, float second )
    {
        return within( first, second, 0.001f);
    }


    /**
     * This will show a string.
     *
     * @param string The string to show.
     *
     * @return A description of the text being shown.
     *
     * @throws IOException If there is an error showing the string.
     */
    protected TextPosition showString( byte[] string ) throws IOException
    {
        TextPosition position = super.showString( string );
        charactersList.add( position );
        return position;
    }


    /**
     * This is the page that the text extraction will start on.  The pages start
     * at page 1.  For example in a 5 page PDF document, if the start page is 1
     * then all pages will be extracted.  If the start page is 4 then pages 4 and 5
     * will be extracted.  The default value is 1.
     *
     * @return Value of property startPage.
     */
    public int getStartPage()
    {
        return startPage;
    }


    /**
     * This will set the first page to be extracted by this class.
     *
     * @param startPage New value of property startPage.
     */
    public void setStartPage(int startPage)
    {
        this.startPage = startPage;
    }


    /**
     * This will get the last page that will be extracted.  This is inclusive,
     * for example if a 5 page PDF an endPage value of 5 would extract the
     * entire document, an end page of 2 would extract pages 1 and 2.  This defaults
     * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
     *
     * @return Value of property endPage.
     */
    public int getEndPage()
    {
        return endPage;
    }


    /**
     * This will set the last page to be extracted by this class.
     *
     * @param endPage New value of property endPage.
     */
    public void setEndPage(int endPage)
    {
        this.endPage = endPage;
    }


    /**
     * Set the desired line separator for output text.  The line.separator
     * system property is used if the line separator preference is not set
     * explicitly using this method.
     *
     * @param separator The desired line separator string.
     */
    public void setLineSeparator(String separator)
    {
        lineSeparator = separator;
    }


    /**
     * This will get the line separator.
     *
     * @return The desired line separator string.
     */
    public String getLineSeparator()
    {
        return lineSeparator;
    }


    /**
     * Set the desired page separator for output text.  The line.separator
     * system property is used if the page separator preference is not set
     * explicitly using this method.
     *
     * @param separator The desired page separator string.
     */
    public void setPageSeparator(String separator)
    {
        pageSeparator = separator;
    }


    /**
     * This will get the page separator.
     *
     * @return The page separator string.
     */
    public String getPageSeparator()
    {
        return pageSeparator;
    }
}
// Source Code of org.pdfbox.util.PDFTextStripper

// Related Classes of org.pdfbox.util.PDFTextStripper