/**
* Copyright (c) 2003, www.pdfbox.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of pdfbox; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://www.pdfbox.org
*
*/
package org.pdfbox.pdfparser;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;
import org.pdfbox.cos.COSBoolean;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSNull;
import org.pdfbox.cos.COSNumber;
import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;
import org.pdfbox.util.PDFOperator;
import org.apache.log4j.Category;
/**
* This will parse a PDF byte stream and extract operands and such.
*
* @author Ben Litchfield (ben@csh.rit.edu)
* @version $Revision: 1.20 $
*/
public class PDFStreamParser extends BaseParser
{
private static Category log = Category.getInstance(PDFStreamParser.class.getName());
private List streamObjects = new ArrayList( 100 );
private RandomAccessFile file;
/**
* Constructor that takes a stream to parse.
*
* @param stream The stream to read data from.
* @param raf The random access file.
*
* @throws IOException If there is an error reading from the stream.
*/
public PDFStreamParser( InputStream stream, RandomAccessFile raf ) throws IOException
{
super( stream );
file = raf;
}
/**
* constructor
*
* @param stream The stream to parse.
*
* @throws IOException If there is an error initializing the stream.
*/
public PDFStreamParser( COSStream stream ) throws IOException
{
this( stream.getUnfilteredStream(), stream.getScratchFile() );
}
/**
* This will parse the tokens in the stream. This will close the
* stream when it is finished parsing.
*
* @throws IOException If there is an error while parsing the stream.
*/
public void parse() throws IOException
{
log.debug( "parse() start" );
try
{
Object token = null;
while( (token = parseNextToken()) != null )
{
streamObjects.add( token );
}
}
catch( IOException io )
{
System.out.println( "" + pdfSource );
throw io;
}
finally
{
pdfSource.close();
}
log.debug( "parse() end" );
}
/**
* This will get the tokens that were parsed from the stream.
*
* @return All of the tokens in the stream.
*/
public List getTokens()
{
return streamObjects;
}
/**
* This will parse the next token in the stream.
*
* @return The next token in the stream or null if there are no more tokens in the stream.
*
* @throws IOException If an io error occurs while parsing the stream.
*/
private Object parseNextToken() throws IOException
{
log.debug( "parseNextToken() start" );
Object retval = null;
skipSpaces();
int nextByte = pdfSource.peek();
if( ((byte)nextByte) == -1 )
{
return null;
}
char c = (char)nextByte;
switch(c)
{
case '<':
{
int leftBracket = pdfSource.read();//pull off first left bracket
c = (char)pdfSource.peek(); //check for second left bracket
pdfSource.unread( leftBracket ); //put back first bracket
if(c == '<')
{
COSDictionary pod = parseCOSDictionary();
skipSpaces();
if((char)pdfSource.peek() == 's')
{
retval = parseCOSStream( pod, file );
}
else
{
retval = pod;
}
}
else
{
retval = parseCOSString();
}
break;
}
case '[': // array
{
retval = parseCOSArray();
break;
}
case '(': // string
retval = parseCOSString();
break;
case '/': // name
retval = parseCOSName();
break;
case 'n': // null
{
String nullString = readString();
if( nullString.equals( "null") )
{
retval = COSNull.NULL;
}
else
{
retval = PDFOperator.getOperator( nullString );
}
break;
}
case 't':
case 'f':
{
String next = readString();
if( next.equals( "true" ) )
{
retval = COSBoolean.TRUE;
break;
}
else if( next.equals( "false" ) )
{
retval = COSBoolean.FALSE;
}
else
{
retval = PDFOperator.getOperator( next );
}
break;
}
case 'R':
{
String line = readString();
if( line.equals( "R" ) )
{
retval = new COSObject( null );
}
else
{
retval = PDFOperator.getOperator( line );
}
break;
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
case '+':
case '.':
{
if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
{
StringBuffer buf = new StringBuffer();
while( Character.isDigit(( c = (char)pdfSource.peek()) )|| c== '-' || c== '+' || c =='.' )
{
buf.append( c );
pdfSource.read();
}
retval = COSNumber.get( buf.toString() );
}
else
{
throw new IOException( "Unknown dir object c='" + c + "' peek='" + (char)pdfSource.peek() + "' " + pdfSource );
}
break;
}
case 'I':
{
//Special case for ID operator
String id = "" + (char)pdfSource.read() + (char)pdfSource.read();
if( !id.equals( "ID" ) )
{
throw new IOException( "Error: Expected operator 'ID' actual='" + id + "'" );
}
ByteArrayOutputStream imageData = new ByteArrayOutputStream();
boolean foundEnd = false;
if( this.isWhitespace() )
{
//pull off the whitespace character
pdfSource.read();
}
int twoBytesAgo = 0;
int lastByte = pdfSource.read();
int currentByte = pdfSource.read();
int count = 0;
//PDF spec is kinda unclear about this. Should a whitespace
//always appear before EI? Not sure, I found a PDF
//(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
//of the image data and will stop parsing prematurely if there is
//not a check for <whitespace>EI<whitespace>.
while( !(isWhitespace( twoBytesAgo ) &&
lastByte == 'E' &&
currentByte == 'I' &&
isWhitespace()) &&
!pdfSource.isEOF() )
{
imageData.write( lastByte );
twoBytesAgo = lastByte;
lastByte = currentByte;
currentByte = pdfSource.read();
}
pdfSource.unread( 'I' ); //unread the EI operator
pdfSource.unread( 'E' );
retval = PDFOperator.getOperator( "ID" );
((PDFOperator)retval).setImageData( imageData.toByteArray() );
break;
}
default:
{
//we must be an operator
String operator = readOperator();
retval = PDFOperator.getOperator( operator );
}
}
if( log.isDebugEnabled() )
{
log.debug( "parseNextToken() retval=" + retval + " peek=" + (char)pdfSource.peek() + " end" );
}
return retval;
}
/**
* This will read an operator from the stream.
*
* @return The operator that was read from the stream.
*
* @throws IOException If there is an error reading from the stream.
*/
protected String readOperator() throws IOException
{
skipSpaces();
//average string size is around 2 and the normal string buffer size is
//about 16 so lets save some space.
StringBuffer buffer = new StringBuffer(4);
while(
!isWhitespace() &&
!isClosing() &&
!pdfSource.isEOF() &&
pdfSource.peek() != (int)'[' &&
pdfSource.peek() != (int)'<' &&
pdfSource.peek() != (int)'(' &&
pdfSource.peek() != (int)'/' &&
(pdfSource.peek() < (int)'0' ||
pdfSource.peek() > (int)'9' ) )
{
buffer.append( (char)pdfSource.read() );
}
return buffer.toString();
}
}