Package org.sf.mustru.filters

Source Code of org.sf.mustru.filters.PdfHandler

package org.sf.mustru.filters;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;

import org.apache.log4j.Logger;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.encryption.DocumentEncryption;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.exceptions.CryptographyException;

import org.pdfbox.util.PDFTextStripper;

import org.sf.mustru.docs.IndexableDoc;
//import org.sf.mustru.docs.TextDoc;
import org.sf.mustru.utils.*;

/**
* Extract text and metadata from a PDF file using the PDFBox class
*/
public class PdfHandler implements HandlerInterface
{
static Logger logger = Logger.getLogger(PdfHandler.class.getName());

/**
  * empty constructor
  */
public PdfHandler() { super(); }

/**
  * Convert a PDF file into text and save PDF fields in a IndexableDoc object
  */
public void getDocument(String ifile, IndexableDoc doc
{
  doc.setFileType("text"); doc.setFileName(ifile);
  COSDocument cosDoc = null;
  logger.info("Extracting text from PDF file " + ifile);
  try
  { cosDoc = parseDocument(new FileInputStream(new File(ifile)) ); }
  catch (OutOfMemoryError exc)
  {  logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage());
  return; }
  catch (IOException e)
  { logger.error("Cannot read PDF document " + ifile + " " +  e.getMessage());
  return; }
  catch (Exception e)
  { logger.error("Could not parse PDF document" + ifile + " " + e.getMessage());
  return; }

  // decrypt the PDF document, if it is encrypted -- use a blank password
  try
  {
   String password = "";
   if ( (cosDoc != null) && (cosDoc.isEncrypted()) )
   { DocumentEncryption decryptor = new DocumentEncryption(cosDoc); decryptor.decryptDocument(password); }
  }
  catch (CryptographyException e)
  { logger.error("Could not decrypt PDF doc: " + ifile + " " + e.getMessage()); closeCOSDocument(cosDoc)
  return; }
  catch (InvalidPasswordException e)
  { logger.error("Could not decrypt PDF doc: " + ifile + " " + e.getMessage()); closeCOSDocument(cosDoc)
  return; }
  catch (IOException e)
  { logger.error("Could not decrypt PDF doc: " + ifile + " " + e.getMessage()); closeCOSDocument(cosDoc)
  return; }

  //*-- extract PDF document's textual content
  String docText = null;
  try
  { PDFTextStripper stripper = new PDFTextStripper();
    docText = stripper.getText(new PDDocument(cosDoc));
  }
  catch (OutOfMemoryError exc)
  { closeCOSDocument(cosDoc);
    logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage());
  }
  catch (Exception e)
  { closeCOSDocument(cosDoc);
    logger.error("Cannot get text from PDF document " + ifile + " " + e.getMessage());
    return;
  }
  //*-- Extract the entire text and save in the contents
  if (docText != null)
  { docText = StringTools.filterChars(docText); doc.setContents(new StringBuffer(docText) ); }

  //*-- Extract PDF document's meta-data
  PDDocument pdDoc = null;
  try
  {
   logger.info("Extracting metadata from PDF file " + ifile);
   pdDoc = new PDDocument(cosDoc);
   PDDocumentInformation docInfo = pdDoc.getDocumentInformation();
   String author   = StringTools.filterChars(docInfo.getAuthor());
   String title    = StringTools.filterChars(docInfo.getTitle());
   String keywords = StringTools.filterChars(docInfo.getKeywords());
   String summary  = StringTools.filterChars(docInfo.getSubject());
   if ((author != null) && (!author.equals("")))     { doc.setAuthor(author); }
   if ((title != null) && (!title.equals("")))       { doc.setTitle(title); }
   if ((keywords != null) && (!keywords.equals(""))) { doc.setMetadata(keywords); }
   if ((summary != null) && (!summary.equals("")))   { doc.setSummary(summary); }
  }
  catch (OutOfMemoryError e)
  { logger.info("Ran out of memory for PDF file metadata:  " + ifile + " " + e.getMessage()); }
  catch (Exception e)
  { logger.info("Did not get PDF document metadata: " + ifile + " " + e.getMessage()); }
  finally
  { closeCOSDocument(cosDoc); closePDDocument(pdDoc); }

  return;
} //*-- end of getDocument

private static COSDocument parseDocument(InputStream is) throws IOException
{ PDFParser parser = new PDFParser(is);
   parser.parse();
   return parser.getDocument();
}

private void closeCOSDocument(COSDocument cosDoc)
{ if (cosDoc != null)
    { try { cosDoc.close(); }
      catch (IOException e) { }
    }
}

private void closePDDocument(PDDocument pdDoc)
{ if (pdDoc != null)
    { try { pdDoc.close(); }
      catch (IOException e) { }
    }
}

}
TOP

Related Classes of org.sf.mustru.filters.PdfHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.