// Source code of it.unimi.dsi.mg4j.document.PdfDocumentFactory

package it.unimi.dsi.mg4j.document;


/*     
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2005-2010 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.util.Properties;


import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.io.ObjectInputStream;
import java.io.PipedReader;
import java.io.PipedWriter;


import org.apache.commons.configuration.ConfigurationException;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;




/** A factory that converts PDF (Portable Document Format) documents into text.
 * Presently this class is very inefficient; it is mainly useful for debugging
 * and exemplification purposes. 
 */


public class PdfDocumentFactory extends PropertyBasedDocumentFactory {
  private static final long serialVersionUID = 1L;


  /** Case-insensitive keys for metadata. 
   * 
   *  @see PropertyBasedDocumentFactory.MetadataKeys
   */ 
  public static enum MetadataKeys {
    /** A property specifying that the factory should use the first line of text as a title (not implemented). */
    PARSETITLE,
  } 


  /** A PDF text stripper that will be used to extract text from PDF documents. */
  private transient PDFTextStripper textStripper;
  /** The word reader used for all documents. */
  private final WordReader wordReader;


  protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException {
    if ( sameKey( MetadataKeys.PARSETITLE, key ) ) {
      /*metadata.put( PARSE_TITLE, value );
      return true;*/
      throw new ConfigurationException( "PARSETITLE is not yet implemented" );
    }
    
    return super.parseProperty( key, values, metadata );
  }


  public PdfDocumentFactory() throws IOException {
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }
  
  public PdfDocumentFactory( final Properties properties ) throws IOException, ConfigurationException {
    super( properties );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }


  public PdfDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) throws IOException {
    super( defaultMetadata );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }


  public PdfDocumentFactory( final String[] property ) throws IOException, ConfigurationException {
    super( property );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }


  public PdfDocumentFactory copy() {
    try {
      return new PdfDocumentFactory( defaultMetadata );
    }
    catch ( IOException e ) {
      throw new RuntimeException( e );
    }
  }
  
  public int numberOfFields() {
    return 1;
  }
  
  public String fieldName( final int field ) {
    ensureFieldIndex( field );
    return "text";
  }
  
  public int fieldIndex( final String fieldName ) {
    return "text".equals( fieldName ) ? 0: -1;
  }
  
  public FieldType fieldType( final int field ) {
    ensureFieldIndex( field );
    return FieldType.TEXT;
  }


  private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
    s.defaultReadObject();
    textStripper = new PDFTextStripper();
  }
  
  public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) {
    return new AbstractDocument() {
      
      private PDDocument pdfDocument;
      private Thread pipingThread;
      private PipedReader pipedReader;
      private PipedWriter pipedWriter;
      
      public CharSequence title() {
        return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata );
      }
      
      public String toString() {
        return title().toString();
      }


      public CharSequence uri() {
        return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.URI, metadata );
      }


      public Object content( final int field ) throws IOException {
        ensureFieldIndex( field );
        pipedReader = new PipedReader();
        pipedWriter = new PipedWriter();
        pdfDocument = PDDocument.load( rawContent );
        pipedWriter.connect( pipedReader );
        pipingThread = new Thread() {
          public void run() {
            try {
              textStripper.writeText( pdfDocument, pipedWriter );
              pipedWriter.close();
              pipedWriter = null;
            }
            catch( InterruptedIOException dontCare ) {}
            catch ( IOException e ) {
              throw new RuntimeException( e );
            }
          }
        };
        pipingThread.start();
        return pipedReader;
      }
      
      public WordReader wordReader( int field ) {
        ensureFieldIndex( field );
        // TODO: should depend on locale or something.
        return wordReader;
      }


      public void close() throws IOException {
        super.close();
        if ( pipingThread != null ) {
          try {
            pipingThread.interrupt();
            pipingThread.join();
            pipingThread = null;
          }
          catch ( InterruptedException e ) {
            throw new RuntimeException( e );
          }
        }


        if ( pipedReader != null ) {
          pipedReader.close();
          pipedReader = null;
        }
        if ( pipedWriter != null ) {
          pipedWriter.close();
          pipedWriter = null;
        }
        if ( pdfDocument != null ) {
          pdfDocument.close();
          pdfDocument = null;
        }
      }
    };
  }
}
// Source code of it.unimi.dsi.mg4j.document.PdfDocumentFactory
//
// Related classes of it.unimi.dsi.mg4j.document.PdfDocumentFactory