Package org.wso2.carbon.registry.indexing.indexer

Source Code of org.wso2.carbon.registry.indexing.indexer.PDFIndexer

package org.wso2.carbon.registry.indexing.indexer;

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.wso2.carbon.registry.indexing.AsyncIndexer.File2Index;
import org.wso2.carbon.registry.indexing.solr.IndexDocument;

public class PDFIndexer implements Indexer {
 
  public static final Log log = LogFactory.getLog(PDFIndexer.class);

  public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    try {
      PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
      parser.parse();
      COSDocument cosDoc = parser.getDocument();

      PDFTextStripper stripper = new PDFTextStripper();
      String docText = stripper.getText(new PDDocument(cosDoc));
      cosDoc.close();

      return new IndexDocument(fileData.path, docText, null);
    } catch (IOException e) {
      String msg = "Failed to write to the index";
      log.error(msg, e);
      throw new SolrException(ErrorCode.SERVER_ERROR, msg);
    }
  }

}
TOP

Related Classes of org.wso2.carbon.registry.indexing.indexer.PDFIndexer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.