Package org.archive.modules.extractor

Source Code of org.archive.modules.extractor.PDFParser

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.modules.extractor;

import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfObject;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PRIndirectReference;
import com.lowagie.text.pdf.PdfArray;

import java.io.*;
import java.util.*;


/** Supports PDF parsing operations.  For now this primarily means
*  extracting URIs, but the logic in extractURIs() could easily be adopted/extended
* for a variety of PDF processing tasks.
*
* @author Parker Thompson
*
*/
//TODO make this more effecient, it currently had to read the whole file into memory
// before processing can begin, and appears to take much longer than it "should"
// to parse small, but admittedly complex, documents.
public class PDFParser {

    protected ArrayList<String> foundURIs;
    protected ArrayList<ArrayList<Integer>> encounteredReferences;
    protected PdfReader documentReader;
    protected byte[] document;
    protected PdfDictionary catalog;

    public PDFParser(String doc) throws IOException {
        resetState();
        getInFromFile(doc);
        initialize();
    }
    public PDFParser(byte[] doc) throws IOException{
        resetState();
        document = doc;
        initialize();
    }

    /** Reinitialize the object as though a new one were created.
     */
    protected void resetState(){
        foundURIs = new ArrayList<String>();
        encounteredReferences = new ArrayList<ArrayList<Integer>>();
        documentReader = null;
        document = null;
        catalog = null;

        for(int i=0; i < encounteredReferences.size(); i++){
            encounteredReferences.add(new ArrayList<Integer>());
        }
    }

    /**
     * Reset the object and initialize it with a new byte array (the document).
     * @param doc
     * @throws IOException
     */
    public void resetState(byte[] doc) throws IOException{
        resetState();
        document = doc;
        initialize();
    }

    /** Reinitialize the object as though a new one were created, complete
     * with a valid pointer to a document that can be read
     * @param doc
     * @throws IOException
     */
    public void resetState(String doc) throws IOException{
        resetState();
        getInFromFile(doc);
        initialize();
    }

    /**
     * Read a file named 'doc' and store its' bytes for later processing.
     * @param doc
     * @throws IOException
     */
    protected void getInFromFile(String doc) throws IOException{
        File documentOnDisk = new File(doc);

        long length = documentOnDisk.length();
        document = new byte[(int)length];

        FileInputStream inStream = new FileInputStream(documentOnDisk);

        inStream.read(document);
    }

    /**
     * Indicates, based on a PDFObject's generation/id pair whether
     * the parser has already encountered this object (or a reference to it)
     * so we don't infinitely loop on circuits within the PDF.
     * @param generation
     * @param id
     * @return True if already seen.
     */
    protected boolean haveSeen(int generation, int id){

        // if we can't store this generation grow our list until we can
        if(generation >= encounteredReferences.size()){
            for(int i=encounteredReferences.size(); i <= generation; i++){
                encounteredReferences.add(new ArrayList<Integer>());
            }

            // clearly we haven't seen it
            return false;
        }

        ArrayList<Integer> generationList
         = encounteredReferences.get(generation);
       
        for (int i: generationList) {
            if(i == id){
                return true;
            }
        }
        return false;
    }

    /**
     * Note that an object (id/generation pair) has been seen by this parser
     * so that it can be handled differently when it is encountered again.
     * @param generation
     * @param id
     */
    protected void markAsSeen(int generation, int id){
        ArrayList<Integer> objectIds = encounteredReferences.get(generation);
        objectIds.add(id);
    }

    /**
     * Get a list of URIs retrieved from the Pdf during the
     * extractURIs operation.
     * @return A list of URIs retrieved from the Pdf during the
     * extractURIs operation.
     */
    public ArrayList<String> getURIs(){
        return foundURIs;
    }

    /**
     * Initialize opens the document for reading.  This is done implicitly
     * by the constuctor.  This should only need to be called directly following
     * a reset.
     * @throws IOException
     */
    protected void initialize() throws IOException{
        if(document != null){
            documentReader = new PdfReader(document);
        }

        catalog = documentReader.getCatalog();
    }

    /**
     * Extract URIs from all objects found in a Pdf document's catalog.
     * Returns an array list representing all URIs found in the document catalog tree.
     * @return URIs from all objects found in a Pdf document's catalog.
     */
    public ArrayList<String> extractURIs(){
        extractURIs(catalog);
        return getURIs();
    }

    /**
     * Parse a PdfDictionary, looking for URIs recursively and adding
     * them to foundURIs
     * @param entity
     */
    @SuppressWarnings("unchecked")
    protected void extractURIs(PdfObject entity){

            // deal with dictionaries
            if(entity.isDictionary()){

                PdfDictionary dictionary= (PdfDictionary)entity;

                Set<PdfName> allkeys = dictionary.getKeys();
                for (PdfName key: allkeys) {
                    PdfObject value = dictionary.get(key);

                    // see if it's the key is a UR[I,L]
                    if( key.toString().equals("/URI") ||
                key.toString().equals("/URL") ) {
                        foundURIs.add(value.toString());

                    }else{
                        this.extractURIs(value);
                    }

                }

            // deal with arrays
            }else if(entity.isArray()){

                PdfArray array = (PdfArray)entity;
                for (PdfObject pdfObject : (Iterable<PdfObject>)array.getArrayList()) {
                    this.extractURIs(pdfObject);
                }

            // deal with indirect references
            }else if(entity.getClass() == PRIndirectReference.class){

                    PRIndirectReference indirect = (PRIndirectReference)entity;

                    // if we've already seen a reference to this object
                    if( haveSeen( indirect.getGeneration(), indirect.getNumber()) ){
                        return;

                    // note that we've seen it if it's new
                    }else{
                        markAsSeen(indirect.getGeneration(), indirect.getNumber() );
                    }

                    // dereference the "pointer" and process the object
                    indirect.getReader(); // FIXME: examine side-effects
                    PdfObject direct = PdfReader.getPdfObject(indirect);

                    this.extractURIs(direct);
            }
    }

    public static void main(String[] argv){

        try {
            PDFParser parser = new PDFParser("/home/parkert/files/pdfspec.pdf");
            ArrayList<String> uris = parser.extractURIs();
            Iterator<String> i = uris.iterator();
            while(i.hasNext()){
                String uri = (String)i.next();
                System.out.println("got uri: " + uri);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
TOP

Related Classes of org.archive.modules.extractor.PDFParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.