/**
* Copyright (c) 2003, www.pdfbox.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of pdfbox; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://www.pdfbox.org
*
*/
package org.pdfbox.util;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.pdfbox.cos.COSArray;
import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSInteger;
import org.pdfbox.cos.COSName;
import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.PDResources;
import org.pdfbox.encryption.DecryptDocument;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.apache.log4j.Category;
/**
* This class takes a PDF document and extracts all of its text, ignoring the
* formatting.
*
* @author Ben Litchfield (ben@csh.rit.edu)
* @version $Revision: 1.33 $
*/
public class PDFTextStripper extends PDFStreamEngine
{
/** Log4j category used for the debug output of this class. */
private static final Category log = Category.getInstance(PDFTextStripper.class.getName());
/** Size, in bytes, of the copy buffer used when the streams of a contents array are merged. */
private static final int BUFFER_SIZE = 1024;
/** Counter of the pages seen during the current extraction; reset to 0 by writeText. */
private int currentPage = 0;
/** First page to extract (inclusive); compared against currentPage. */
private int startPage = 1;
/** Last page to extract (inclusive); the default lets every page through. */
private int endPage = Integer.MAX_VALUE;
/** The document handed to the most recent writeText call. */
private PDDocument document;
/** TextPosition objects collected by showString for the page being processed. */
private final List charactersList = new ArrayList();
/** Destination of the extracted text, set by writeText. */
private Writer output;
/** Written between lines of text; defaults to the line.separator system property. */
private String lineSeparator = System.getProperty("line.separator");
/** Written after each flushed page; defaults to the line.separator system property. */
private String pageSeparator = System.getProperty("line.separator");
/**
 * Extracts the text of a document and returns it as one string. The text is
 * produced by writeText( PDDocument, Writer ) into an in-memory writer.
 *
 * @param doc The document to get the text from.
 *
 * @return The text of the PDF document.
 *
 * @throws IOException If writeText fails for the given document.
 */
public String getText( PDDocument doc ) throws IOException
{
    StringWriter textBuffer = new StringWriter();
    writeText( doc, textBuffer );
    String extracted = textBuffer.toString();
    return extracted;
}
/**
 * Wraps the low level document in a PDDocument and returns its text.
 *
 * @param doc The low level document to get the text from.
 *
 * @return The text of the PDF document.
 *
 * @throws IOException If the text cannot be extracted.
 *
 * @deprecated Use {@link #getText(PDDocument)} instead.
 * @see #getText(PDDocument)
 */
public String getText( COSDocument doc ) throws IOException
{
    return getText( new PDDocument( doc ) );
}
/**
 * Wraps the low level document in a PDDocument and writes its text to the writer.
 *
 * @param doc The low level document to get the text from.
 * @param outputStream The location to put the text.
 *
 * @throws IOException If the text cannot be extracted or written.
 *
 * @deprecated Use {@link #writeText(PDDocument, Writer)} instead.
 * @see #writeText(PDDocument, Writer)
 */
public void writeText( COSDocument doc, Writer outputStream ) throws IOException
{
    writeText( new PDDocument( doc ), outputStream );
}
/**
 * This will take a PDDocument and write the text of that document to the writer.
 * <p>
 * If the document has an encryption dictionary, its /P permission flags are
 * checked first and extraction is refused when the text extraction flag is not
 * set. If the document is still encrypted, a decryption with the empty
 * password is attempted before the pages are processed.
 *
 * @param doc The document to get the data from.
 * @param outputStream The location to put the text.
 *
 * @throws IOException If the permission flags are missing or forbid extraction,
 * if the document cannot be decrypted with the empty password, or if there is
 * an error processing the pages.
 */
public void writeText( PDDocument doc, Writer outputStream ) throws IOException
{
    COSDictionary encrypted = doc.getDocument().getEncryptionDictionary();
    if( encrypted != null )
    {
        // A missing or non-integer /P entry used to surface as a NullPointerException
        // or ClassCastException; report it as an invalid document instead.
        COSBase permissions = encrypted.getDictionaryObject( COSName.getPDFName( "P" ) );
        if( !(permissions instanceof COSInteger) )
        {
            throw new IOException( "Error: encryption dictionary has a missing or invalid /P entry" );
        }
        long pVal = ((COSInteger)permissions).intValue();
        // The flag with value 16 has to be set for text extraction to be allowed.
        if( (pVal & 16) == 0 )
        {
            throw new IOException( "You do not have permission to extract text" );
        }
    }
    currentPage = 0;
    document = doc;
    output = outputStream;
    if( document.isEncrypted() )
    {
        // We are expecting non-encrypted documents here, but it is common
        // for users to pass in a document that is encrypted with an empty
        // password (such a document appears to not be encrypted by
        // someone viewing the document, thus the confusion). We will
        // attempt to decrypt with the empty password to handle this case.
        //
        log.debug("Document is encrypted, decrypting with empty password");
        DecryptDocument decryptor = new DecryptDocument( document );
        try
        {
            decryptor.decryptDocument("");
        }
        catch (CryptographyException e)
        {
            // Keep the original exception as the cause so the stack trace is not lost.
            IOException failure = new IOException("Error decrypting document, details: " + e.getMessage());
            failure.initCause( e );
            throw failure;
        }
        catch (InvalidPasswordException e)
        {
            IOException failure = new IOException("Error: document is encrypted");
            failure.initCause( e );
            throw failure;
        }
    }
    processPages( document.getDocumentCatalog().getAllPages() );
}
/**
 * This will process all of the pages and the text that is in them.
 * <p>
 * A page whose contents entry is a single stream is processed directly. When
 * the contents entry is an array of streams, the streams are first merged into
 * one temporary stream. Pages without any content are still counted so that
 * the start/end page numbers keep referring to the real page positions.
 *
 * @param pages The pages object in the document.
 *
 * @throws IOException If there is an error parsing the text or if a contents
 * entry is neither a stream nor an array.
 */
protected void processPages( List pages ) throws IOException
{
    if( log.isDebugEnabled() )
    {
        log.debug( "processPages( " + pages + " )" );
    }
    Iterator pageIter = pages.iterator();
    while( pageIter.hasNext() )
    {
        PDPage page = (PDPage)pageIter.next();
        COSBase contents = page.getCOSDictionary().getDictionaryObject( COSName.CONTENTS );
        if( contents == null )
        {
            // Nothing to extract, but the page still occupies a page number.
            currentPage++;
        }
        else if( contents instanceof COSStream )
        {
            processPage( page, (COSStream)contents );
        }
        else if( contents instanceof COSArray )
        {
            COSArray contentsArray = (COSArray)contents;
            if( contentsArray.size() > 0 )
            {
                processPage( page, mergeContentStreams( contentsArray ) );
            }
            else
            {
                // Empty contents array: same as a page without content.
                currentPage++;
            }
        }
        else
        {
            throw new IOException( "Contents are unknown type:" + contents.getClass().getName() );
        }
    }
    log.debug( "processPages() end" );
}

/**
 * Concatenates the unfiltered data of every stream in a contents array into a
 * new temporary stream that shares the dictionary and scratch file of the
 * first stream.
 *
 * @param contentsArray A non-empty array whose elements are COSObjects wrapping COSStreams.
 *
 * @return The temporary stream holding the concatenated data.
 *
 * @throws IOException If there is an error reading or writing the stream data.
 */
private COSStream mergeContentStreams( COSArray contentsArray ) throws IOException
{
    COSObject first = (COSObject)contentsArray.get( 0 );
    COSStream firstStream = (COSStream)first.getObject();
    COSStream merged =
        new COSStream( firstStream.getDictionary(),
                       firstStream.getScratchFile() );
    byte[] buffer = new byte[ BUFFER_SIZE ];
    OutputStream mergedOutput = merged.createUnfilteredStream();
    try
    {
        for( int i=0; i<contentsArray.size(); i++ )
        {
            COSObject obj = (COSObject)contentsArray.get( i );
            COSStream content = (COSStream)obj.getObject();
            InputStream input = content.getUnfilteredStream();
            try
            {
                int amountRead;
                while( (amountRead = input.read( buffer, 0, BUFFER_SIZE )) != -1 )
                {
                    mergedOutput.write( buffer, 0, amountRead );
                }
            }
            finally
            {
                input.close();
            }
            //handle the case where there is no whitespace between
            //streams in the contents array, without this it is possible
            //that two operators will get concatenated together.
            //A single byte is written so the result does not depend on
            //the platform default charset.
            mergedOutput.write( '\n' );
        }
    }
    finally
    {
        // Closing also flushes, so all data is in place before the
        // merged stream is read back.
        mergedOutput.close();
    }
    return merged;
}
/**
 * This will process the contents of a single page. The page counter is
 * advanced for every call; the content stream is only parsed and flushed to
 * the output when the counter lies within the start/end page range.
 *
 * @param page The page to process.
 * @param content The contents of the page.
 *
 * @throws IOException If there is an error processing the page.
 */
protected void processPage( PDPage page, COSStream content ) throws IOException
{
    long pageStarted = System.currentTimeMillis();
    if( log.isDebugEnabled() )
    {
        log.debug( "processPage( " + page + ", " + content + " )" );
    }
    PDResources pageResources = page.findResources();
    Map fontMap = (pageResources == null) ? null : pageResources.getFonts();

    currentPage++;
    boolean pageInRange = startPage <= currentPage && currentPage <= endPage;
    if( pageInRange )
    {
        // Drop the text collected for the previous page.
        charactersList.clear();

        long parseStarted = System.currentTimeMillis();
        processStream( content, fontMap );
        long parseFinished = System.currentTimeMillis();

        long flushStarted = System.currentTimeMillis();
        flushText();
        long flushFinished = System.currentTimeMillis();

        if( log.isDebugEnabled() )
        {
            log.debug( "processStream time=" + (parseFinished-parseStarted) );
            log.debug( "flushText time=" + (flushFinished-flushStarted) );
        }
    }
    long pageFinished = System.currentTimeMillis();
    if( log.isDebugEnabled() )
    {
        log.debug( "processPage() end time=" + (pageFinished-pageStarted) );
    }
}
/**
 * This will print the collected text of the current page to the output.
 * <p>
 * The text positions are walked in the order they were collected. For each
 * one the method decides whether it overwrites already written text (and is
 * suppressed), whether it starts a new line, and whether a space has to be
 * written before it. A value of -1 in the tracking variables means "not set
 * yet for the current line". The page separator is written and the output is
 * flushed at the end.
 *
 * @throws IOException If there is an error writing the text.
 */
protected void flushText() throws IOException
{
    log.debug( "flushText() start" );
    float currentY = -1;
    float lastBaselineFontSize = -1;
    log.debug("<Starting text object list>");
    float endOfLastTextX = -1;
    float startOfNextWordX = -1;
    Iterator textIter = charactersList.iterator();
    while( textIter.hasNext() )
    {
        TextPosition position = (TextPosition)textIter.next();
        // Read the string once; it may be null and every use below has to cope with that.
        String text = position.getString();
        // RDD - We will suppress text that is very close to the current line
        // and which overwrites previously rendered text on this line.
        // This is done specifically to handle a reasonably common situation
        // where an application (MS Word, in the case of my examples) renders
        // text four times at small (1 point) offsets in order to accomplish
        // bold printing. You would not want to do this step if you were
        // going to render the TextPosition objects graphically.
        //
        if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
            (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
        {
            if (log.isDebugEnabled())
            {
                log.debug("Suppressing text overwrite" +
                          " x: " + position.getX() +
                          " endOfLastTextX: " + endOfLastTextX +
                          " string: " + text);
            }
            continue;
        }
        // RDD - Here we determine whether this text object is on the current
        // line. We use the lastBaselineFontSize to handle the superscript
        // case, and the size of the current font to handle the subscript case.
        // Text must overlap with the last rendered baseline text by at least
        // a small amount in order to be considered as being on the same line.
        //
        if (currentY != -1 &&
            ((position.getY() > (currentY + (lastBaselineFontSize * 0.9f))) ||
             (position.getY() < (currentY - (position.getFontSize() * 0.9f)))))
        {
            if (log.isDebugEnabled())
            {
                log.debug("<newline currentY=" + currentY + ", y=" + position.getY() + ">");
            }
            output.write(lineSeparator);
            endOfLastTextX = -1;
            startOfNextWordX = -1;
            currentY = -1;
            lastBaselineFontSize = -1;
        }
        if (startOfNextWordX != -1 && startOfNextWordX < position.getX())
        {
            if (log.isDebugEnabled())
            {
                log.debug("<space startOfNextWordX=" + startOfNextWordX + ", x=" + position.getX() + ">");
            }
            output.write(0x20);
        }
        if (log.isDebugEnabled())
        {
            log.debug("flushText" +
                      " x=" + position.getX() +
                      " y=" + position.getY() +
                      " width=" + position.getWidth() +
                      " currentY=" + currentY +
                      " endOfLastTextX=" + endOfLastTextX +
                      " startOfNextWordX=" + startOfNextWordX +
                      " fontSize=" + position.getFontSize() +
                      " string=\"" + text + "\"");
        }
        if (currentY == -1)
        {
            currentY = position.getY();
        }
        if (currentY == position.getY())
        {
            lastBaselineFontSize = position.getFontSize();
        }
        // RDD - endX is what PDF considers to be the x coordinate of the
        // end position of the text. We use it in computing our metrics below.
        //
        float endX = position.getX() + position.getWidth();
        // RDD - Here we compute the value that represents the end of the rendered
        // text. This value is used to determine whether subsequent text rendered
        // on the same line overwrites the current text.
        //
        // We subtract any positive padding to handle cases where extreme amounts
        // of padding are applied, then backed off (not sure why this is done, but there
        // are cases where the padding is on the order of 10x the character width, and
        // the TJ just backs up to compensate after each character). Also, we subtract
        // an amount to allow for kerning (a percentage of the width of the last
        // character).
        //
        float textTrailingSpaceWidth = 0;
        if (text != null)
        {
            // One word spacing is accounted for per trailing space character.
            for (int n = text.length()-1; n >= 0 && text.charAt(n) == ' '; n--)
            {
                textTrailingSpaceWidth += position.getWordSpacing();
            }
        }
        endOfLastTextX = endX -
                         Math.max(position.getPadding(), 0) -
                         textTrailingSpaceWidth -
                         (0.4f * position.getLastCharWidth());
        // RDD - We add a conservative approximation for space determination.
        //
        startOfNextWordX = endX + (position.getWordSpacing() * 0.30f);
        if (text != null)
        {
            output.write(text);
        }
    }
    // RDD - newline at end of flush - required for end of page (so that the top
    // of the next page starts on its own line).
    //
    log.debug("<newline endOfFlush=\"true\">");
    output.write(pageSeparator);
    output.flush();
}
/**
 * This will determine if two floating point numbers are within a specified
 * variance of each other. Both bounds are exclusive.
 *
 * @param first The number the range is centered on.
 * @param second The number to test against the range.
 * @param variance The allowed variance.
 *
 * @return true if second lies strictly between first - variance and first + variance.
 */
private boolean within( float first, float second, float variance )
{
    float lowerBound = first - variance;
    if( !(second > lowerBound) )
    {
        return false;
    }
    float upperBound = first + variance;
    return second < upperBound;
}
/**
 * This will determine if two floating point numbers are equal within a small
 * amount of variance (0.001).
 *
 * @param first The first number to compare.
 * @param second The second number to compare.
 *
 * @return true if the two numbers differ by less than the tolerance.
 */
private boolean equals( float first, float second )
{
    final float tolerance = 0.001f;
    return within( first, second, tolerance );
}
/**
 * This will show a string. The text position produced by the superclass is
 * remembered so that flushText can write it out later.
 *
 * @param string The string to show.
 *
 * @return A description of the text being shown.
 *
 * @throws IOException If there is an error showing the string.
 */
protected TextPosition showString( byte[] string ) throws IOException
{
    TextPosition shown = super.showString( string );
    charactersList.add( shown );
    return shown;
}
/**
 * Returns the page that the text extraction will start on. Page numbers
 * start at 1. For example, in a 5 page PDF document a start page of 1
 * extracts all pages and a start page of 4 extracts pages 4 and 5.
 * The default value is 1.
 *
 * @return Value of property startPage.
 */
public int getStartPage()
{
    return this.startPage;
}
/**
 * Sets the first page (inclusive) to be extracted by this class.
 *
 * @param startPage New value of property startPage.
 */
public void setStartPage(final int startPage)
{
    this.startPage = startPage;
}
/**
 * Returns the last page that will be extracted. The value is inclusive: for
 * a 5 page PDF an end page of 5 extracts the entire document and an end page
 * of 2 extracts pages 1 and 2. The default is Integer.MAX_VALUE so that all
 * pages of the PDF are extracted.
 *
 * @return Value of property endPage.
 */
public int getEndPage()
{
    return this.endPage;
}
/**
 * Sets the last page (inclusive) to be extracted by this class.
 *
 * @param endPage New value of property endPage.
 */
public void setEndPage(final int endPage)
{
    this.endPage = endPage;
}
/**
 * Sets the string that is written between lines of the output text. Until
 * this method is called, the value of the line.separator system property
 * is used.
 *
 * @param separator The desired line separator string.
 */
public void setLineSeparator(final String separator)
{
    this.lineSeparator = separator;
}
/**
 * Returns the string that is written between lines of the output text.
 *
 * @return The current line separator string.
 */
public String getLineSeparator()
{
    return this.lineSeparator;
}
/**
 * Sets the string that is written at the end of each page of output text.
 * Until this method is called, the value of the line.separator system
 * property is used.
 *
 * @param separator The desired page separator string.
 */
public void setPageSeparator(final String separator)
{
    this.pageSeparator = separator;
}
/**
 * Returns the string that is written at the end of each page of output text.
 *
 * @return The current page separator string.
 */
public String getPageSeparator()
{
    return this.pageSeparator;
}
}