Source Code of net.sf.regain.crawler.preparator.PdfBoxPreparator

/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004  Til Schneider
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Contact: Til Schneider, info@murfman.de
 *
 * CVS information:
 *  $RCSfile$
 *   $Source$
 *     $Date: 2011-01-02 18:09:46 +0100 (So, 02 Jan 2011) $
 *   $Author: thtesche $
 * $Revision: 477 $
 */
package net.sf.regain.crawler.preparator;


import java.io.InputStream;
import java.io.IOException;
import java.util.List;
import org.apache.log4j.Logger;


import net.sf.regain.RegainException;
import net.sf.regain.crawler.document.AbstractPreparator;
import net.sf.regain.crawler.document.RawDocument;


import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripper;


/**
 * Präpariert ein PDF-Dokument für die Indizierung.
 * <p>
 * Dabei werden die Rohdaten des Dokuments von Formatierungsinformation befreit,
 * es wird der Titel extrahiert.
 *
 * @author Til Schneider, www.murfman.de
 */
public class PdfBoxPreparator extends AbstractPreparator {


  /** The logger for this class */
  private static Logger mLog = Logger.getLogger(PdfBoxPreparator.class);


  /**
   * Creates a new instance of PdfBoxPreparator.
   *
   * @throws RegainException If creating the preparator failed.
   */
  public PdfBoxPreparator() throws RegainException {
    super("application/pdf");
  }


  /**
   * Präpariert ein Dokument für die Indizierung.
   *
   * @param rawDocument Das zu pr�pariernde Dokument.
   *
   * @throws RegainException Wenn die Pr�paration fehl schlug.
   */
  @SuppressWarnings("unchecked")
  public void prepare(RawDocument rawDocument) throws RegainException {
    String url = rawDocument.getUrl();


    InputStream stream = null;
    PDDocument pdfDocument = null;


    try {
      // Create a InputStream that reads the content.
      stream = rawDocument.getContentAsStream();


      // Parse the content
      PDFParser parser = new PDFParser(stream);
      parser.parse();
      pdfDocument = parser.getPDDocument();


      // Decrypt the PDF-Dokument
      if (pdfDocument.isEncrypted()) {
        mLog.debug("Document is encrypted: " + url);
        StandardDecryptionMaterial sdm = new StandardDecryptionMaterial("");
        pdfDocument.openProtection(sdm);
        AccessPermission ap = pdfDocument.getCurrentAccessPermission();


        if (!ap.canExtractContent()) {
          throw new RegainException("Document is encrypted and can't be opened: " + url);
        }
      }


      // Extract the text with a utility class
      PDFTextStripper stripper = new PDFTextStripper();
      stripper.setSuppressDuplicateOverlappingText(false);
      stripper.setSortByPosition(true);
      stripper.setStartPage(1);
      stripper.setEndPage(Integer.MAX_VALUE);


      setCleanedContent(stripper.getText(pdfDocument).replaceAll("visiblespace", " "));


      // extract annotations
      StringBuilder annotsResult = new StringBuilder();
      List allPages = pdfDocument.getDocumentCatalog().getAllPages();
      for (int i = 0; i < allPages.size(); i++) {
        int pageNum = i + 1;
        PDPage page = (PDPage) allPages.get(i);
        List<PDAnnotation> annotations = page.getAnnotations();
        if (annotations.size() < 1) {
          continue;
        }
        mLog.debug("Total annotations = " + annotations.size());
        mLog.debug("\nProcess Page " + pageNum + "...");
        for (PDAnnotation annotation : annotations) {
          if (annotation.getContents() != null && annotation.getContents().length() > 0) {
            annotsResult.append(annotation.getContents());
            annotsResult.append(" ");
            mLog.debug("Text from annotation: " + annotation.getContents());
          }
        }
      }
      if (annotsResult.length() > 0) {
        setCleanedContent(getCleanedContent() + " Annotations " + annotsResult.toString());
      }


      // Get the meta data
      PDDocumentInformation info = pdfDocument.getDocumentInformation();
      StringBuilder metaData = new StringBuilder();
      metaData.append("p.");
      metaData.append(Integer.toString(pdfDocument.getNumberOfPages()));
      metaData.append(" ");


      // Check if fields are null
      if (info.getAuthor() != null) {
        metaData.append(info.getAuthor());
        metaData.append(" ");
      }
      if (info.getSubject() != null) {
        metaData.append(info.getSubject());
        metaData.append(" ");
      }
      if (info.getKeywords() != null) {
        metaData.append(info.getKeywords());
        metaData.append(" ");
      }


      if (info.getTitle() != null) {
        setTitle(info.getTitle());
      }


      setCleanedMetaData(metaData.toString());
      if (mLog.isDebugEnabled()) {
        mLog.debug("Extracted meta data ::" + getCleanedMetaData()
                + ":: from " + rawDocument.getUrl());
      }


    } catch (CryptographyException exc) {
      throw new RegainException("Error decrypting document: " + url, exc);


    } catch (BadSecurityHandlerException exc) {
      // They didn't supply a password and the default of "" was wrong.
      throw new RegainException("Document is encrypted: " + url, exc);


    } catch (IOException exc) {
      throw new RegainException("Error reading document: " + url, exc);


    } finally {
      if (stream != null) {
        try {
          stream.close();
        } catch (Exception exc) {
        }
      }
      if (pdfDocument != null) {
        try {
          pdfDocument.close();
        } catch (Exception exc) {
        }
      }
    }
  }
}
Source Code of net.sf.regain.crawler.preparator.PdfBoxPreparator

Related Classes of net.sf.regain.crawler.preparator.PdfBoxPreparator