/**
* Copyright (c) 2003, www.pdfbox.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of pdfbox; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://www.pdfbox.org
*
*/
package org.pdfbox.pdfparser;
import java.io.File;
import java.io.InputStream;
import java.io.IOException;
import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSInteger;
import org.pdfbox.cos.COSObject;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.persistence.util.COSObjectKey;
import org.apache.log4j.Logger;
/**
* This class will handle the parsing of the PDF document.
*
* @author Ben Litchfield (ben@csh.rit.edu)
* @version $Revision: 1.23 $
*/
public class PDFParser extends BaseParser
{
private static Logger log = Logger.getLogger( PDFParser.class );
private static final String PDF_HEADER = "%PDF-";
private COSDocument document;
/**
* Temp file directory.
*/
private File tempDirectory = new File( System.getProperty( "java.io.tmpdir" ) );
/**
* Constructor.
*
* @param input The input stream that contains the PDF document.
*
* @throws IOException If there is an error initializing the stream.
*/
public PDFParser( InputStream input ) throws IOException
{
super( input );
}
/**
* This is the directory where pdfbox will create a temporary file
* for storing pdf document stream in. By default this directory will
* be the value of the system property java.io.tmpdir.
*
* @param tmpDir The directory to create scratch files needed to store
* pdf document streams.
*/
public void setTempDirectory( File tmpDir )
{
tempDirectory = tmpDir;
}
/**
* This will prase the stream and create the PDF document. This will close
* the stream when it is done parsing.
*
* @throws IOException If there is an error reading from the stream.
*/
public void parse() throws IOException
{
try
{
document = new COSDocument( tempDirectory );
String header = readLine();
if( log.isDebugEnabled() )
{
log.debug( "Header=" + header );
}
if( header.length() < PDF_HEADER.length()+1 )
{
throw new IOException( "Error: Header is corrupt '" + header + "'" );
}
String pdfHeader = header.substring( 0, PDF_HEADER.length() );
try
{
float pdfVersion = Float.parseFloat( header.substring( PDF_HEADER.length(), header.length() ) );
document.setVersion( pdfVersion );
}
catch( NumberFormatException e )
{
throw new IOException( "Error getting pdf version:" + e );
}
skipHeaderFillBytes();
Object nextObject;
boolean wasLastParsedObjectAnXref = false;
try
{
while( (nextObject = parseObject()) != null )
{
if( nextObject instanceof COSObject )
{
COSObject pdfObj = (COSObject)nextObject;
document.addObject( pdfObj );
wasLastParsedObjectAnXref = false;
}
else
{
PDFXref xref = (PDFXref)nextObject;
addXref((PDFXref)nextObject);
wasLastParsedObjectAnXref = true;
}
skipSpaces();
}
}
catch( IOException e )
{
if( wasLastParsedObjectAnXref )
{
//Then we assume that there is just random garbage after
//the xref, not sure why the PDF spec allows this but it does.
}
else
{
//some other error so just pass it along
throw e;
}
}
}
finally
{
pdfSource.close();
}
}
/**
* This will skip a header's binary fill bytes. This is in accordance to
* PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header"
*
* @throws IOException If there is an error reading from the stream.
*/
protected void skipHeaderFillBytes() throws IOException
{
int c;
c = pdfSource.peek();
if ( c == 37 ) //37 is the % character, a comment
{
// Fill bytes conform with PDF reference
skipSpaces();
}
else if ( c >= 128 )
{
// Fill bytes conform with PDF reference (but without comment sign)
// => skip until EOL
readLine();
}
// else: no fill bytes
}
/**
* This will get the document that was parsed. parse() must be called before this is called.
*
* @return The document that was parsed.
*
* @throws IOException If there is an error getting the document.
*/
public COSDocument getDocument() throws IOException
{
if( document == null )
{
throw new IOException( "You must call parse() before calling getDocument()" );
}
return document;
}
/**
* This will get the PD document that was parsed.
*
* @return The document at the PD layer.
*
* @throws IOException If there is an error getting the document.
*/
public PDDocument getPDDocument() throws IOException
{
return new PDDocument( getDocument() );
}
/**
* This will parse a document object from the stream.
*
* @param file The raf used for parsing.
*
* @return The parsed object.
*
* @throws IOException If an IO error occurs.
*/
private Object parseObject() throws IOException
{
Object object = null;
char peekedChar = (char)pdfSource.peek();
if( pdfSource.isEOF() )
{
//end of file we will return a null object and call it a day.
}
else if( peekedChar == 'x' ||
peekedChar == 't' )
{
//System.out.println( "parseObject() parsing xref" );
String xref = null;
int number = 0;
int genNumber = 0;
//FDF documents do not always have the xref
if( peekedChar == 'x' )
{
xref = readLine();
number = readInt();
genNumber = readInt();
}
String nextLine = readString();
while( !nextLine.equals( "trailer" ) && !pdfSource.isEOF() )
{
//skip past all the xref entries.
nextLine = readString();
}
skipSpaces();
COSDictionary parsedTrailer = parseCOSDictionary();
COSDictionary docTrailer = document.getTrailer();
if( log.isDebugEnabled() )
{
log.debug( "parsedTrailer=" + parsedTrailer );
log.debug( "docTrailer=" + docTrailer );
}
if( docTrailer == null )
{
document.setTrailer( parsedTrailer );
}
else
{
docTrailer.addAll( parsedTrailer );
}
if( log.isDebugEnabled() )
{
log.debug( "Final trailer=" + docTrailer );
}
object = new PDFXref( number, genNumber );
if( peekedChar == 'x' )
{
skipSpaces();
String startxref = readString();
if( !startxref.equals( "startxref" ) )
{
throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource );
}
skipSpaces();
int someInt = readInt();
}
//This MUST be readLine because readString strips out comments
//and it will think that %% is a comment in from of the EOF
String eof = readExpectedString( "%%EOF" );
if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
{
throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() + " next=" +readString() );
}
else if( !pdfSource.isEOF() )
{
//we might really be at the end of the file, there might just be some crap at the
//end of the file.
if( pdfSource.available() < 1000 )
{
//We need to determine if we are at the end of the file.
byte[] data = new byte[ 1000 ];
int amountRead = pdfSource.read( data );
if( amountRead != -1 )
{
pdfSource.unread( data, 0, amountRead );
}
boolean atEndOfFile = true;//we assume yes unless we find another.
for( int i=0; i<amountRead-3 && atEndOfFile; i++ )
{
atEndOfFile = !(data[i] == 'E' &&
data[i+1] == 'O' &&
data[i+2] == 'F' );
}
if( atEndOfFile )
{
while( pdfSource.read( data, 0, data.length ) != -1 )
{
//read until done.
}
}
}
}
}
else
{
int number;
int genNum;
String objectKey = null;
try
{
number = readInt();
}
catch( IOException e )
{
//ok for some reason "GNU Ghostscript 5.10" puts two endobj
//statements after an object, of course this is nonsense
//but because we want to support as many PDFs as possible
//we will simply try again
number = readInt();
}
genNum = readInt();
if( log.isDebugEnabled() )
{
log.debug( "Parsing object (" + number + "," + genNum + ")" );
}
objectKey = readString( 3 );
//System.out.println( "parseObject() num=" + number + " genNumber=" + genNum + " key='" + objectKey + "'" );
if( !objectKey.equals( "obj" ) )
{
throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource );
}
skipSpaces();
COSBase pb = parseDirObject();
String endObjectKey = readString();
if( endObjectKey.equals( "stream" ) )
{
pdfSource.unread( endObjectKey.getBytes() );
pdfSource.unread( ' ' );
if( pb instanceof COSDictionary )
{
pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
}
else
{
// this is not legal
// the combination of a dict and the stream/endstream forms a complete stream object
throw new IOException("stream not preceded by dictionary");
}
endObjectKey = readString();
}
object = getObjectFromPool(new COSObjectKey(number, genNum));
COSObject pdfObject = (COSObject)object;
pdfObject.setObject(pb);
pdfObject.setObjectNumber( new COSInteger( number ) );
pdfObject.setGenerationNumber( new COSInteger( genNum ) );
if( !endObjectKey.equals( "endobj" ) )
{
if( !pdfSource.isEOF() )
{
//we will try again incase there was some garbage which
//some writers will leave behind.
endObjectKey = readString();
if( !endObjectKey.equals( "endobj" ) )
{
throw new IOException("expected='endobj' actual='" + endObjectKey + "' " + pdfSource);
}
}
}
skipSpaces();
}
//System.out.println( "parsed=" + object );
return object;
}
}