Package com.gentics.cr.lucene.indexer.transformer.pdf

Source Code of com.gentics.cr.lucene.indexer.transformer.pdf.PDFContentTransformer

package com.gentics.cr.lucene.indexer.transformer.pdf;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.util.Date;

import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

import com.gentics.cr.CRResolvableBean;
import com.gentics.cr.configuration.GenericConfiguration;
import com.gentics.cr.exceptions.CRException;
import com.gentics.cr.lucene.indexer.transformer.ContentTransformer;

/**
*
* Last changed: $Date$
* @version $Revision$
* @author $Author$
*
*/
public class PDFContentTransformer extends ContentTransformer {

  private PDFTextStripper stripper = null;
  private static final String TRANSFORMER_ATTRIBUTE_KEY = "attribute";
  private String attribute = "";

  /**
   * Get new instance of PDFContentTransformer
   * @param config
   */
  public PDFContentTransformer(GenericConfiguration config) {
    super(config);
    attribute = (String) config.get(TRANSFORMER_ATTRIBUTE_KEY);
  }

  /**
   * Converts a byte array containing a pdf file to a String that can be indexed by lucene
   * @param obj
   * @return
   */
  private String getStringContents(Object obj) throws CRException {
    ByteArrayInputStream is;
    if (obj instanceof byte[]) {
      is = new ByteArrayInputStream((byte[]) obj);
    } else {
      throw new IllegalArgumentException("Parameter must be instance of byte[]");
    }
    PDDocument pdfDocument = null;
    String contents = null;

    try {
      pdfDocument = PDDocument.load(is);

      if (pdfDocument.isEncrypted()) {
        //Just try using the default password and move on
        pdfDocument.decrypt("");
      }

      //create a writer where to append the text content.
      StringWriter writer = new StringWriter();
      if (stripper == null) {
        stripper = new PDFTextStripper();
      } else {
        stripper.resetEngine();
      }
      stripper.writeText(pdfDocument, writer);

      // Note: the buffer to string operation is costless;
      // the char array value of the writer buffer and the content string
      // is shared as long as the buffer content is not modified, which will
      // not occur here.
      contents = writer.getBuffer().toString();

    } catch (IOException e) {
      throw new CRException(e);
    } catch (CryptographyException e) {
      throw new CRException(e);
    } catch (InvalidPasswordException e) {
      throw new CRException(e);
    } catch (Exception e) {
      //Catch all Exceptions happening here to not disturb the indexer
      e.printStackTrace();
    } finally {
      if (pdfDocument != null) {
        try {
          pdfDocument.close();
        } catch (IOException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }
    return (contents);
  }

  @Override
  public void processBean(CRResolvableBean bean) throws CRException {
    if (this.attribute != null) {
      Object obj = bean.get(this.attribute);
      if (obj != null) {
        long s = new Date().getTime();
        String newString = getStringContents(obj);
        if (newString != null) {
          bean.set(this.attribute, newString);
        }
        long e = new Date().getTime();
        LOGGER.debug("transforming object " + bean.getContentid() + " (" + bean.getMimetype() + ") took "
            + (e - s) + "ms");
      }
    } else {
      LOGGER.error("Configured attribute is null. Bean will not be processed");
    }

  }

  @Override
  public void destroy() {
    // TODO Auto-generated method stub

  }
}
TOP

Related Classes of com.gentics.cr.lucene.indexer.transformer.pdf.PDFContentTransformer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.