// Source: net.fp.rp.search.back.extractor.PdfDataExtractor

/*
* Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
* built with the help of Fast-Soft (fastsoftdev@yahoo.com)
*
* released under terms of the GPL license
* http://www.opensource.org/licenses/gpl-license.php
*
* This product includes software developed by the
* Apache Software Foundation (http://www.apache.org)."
*
* This product includes software developed by the
* Spring Framework Project (http://www.springframework.org)."
*
*/
package net.fp.rp.search.back.extractor;

import net.fp.rp.common.exception.RpException;
import net.fp.rp.search.back.extractor.util.UtilExtract;
import net.fp.rp.search.back.struct.DocumStruct;
import net.fp.rp.search.back.struct.NodeStruct;
import net.fp.rp.search.back.struct.TupleStruct;
import net.fp.rp.search.common.util.MessageUtil;
import net.fp.rp.search.common.util.Util;
import net.fp.rp.search.mid.global.PluginManager;
import net.fp.rp.search.plugins.INewInformation;

import org.apache.log4j.Logger;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;

import org.pdfbox.util.PDFTextStripper;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

import java.util.LinkedList;
import java.util.regex.Pattern;


/**
* Extracts information from pdf files in a format that can be stored and added
* to the Index.
*
* @author brownpa
* Copyright @link www.firstpartners.net/red
*/
public class PdfDataExtractor extends GenericDataExtractor {
    /** Logger for this class and subclasses */
    protected final Logger logger = Logger.getLogger(getClass());

    /**
     * The original place where we got this data
     *
     * @return pointer
     */
    public String getOriginalUri() {
        return "";
    }

    /**
     * Carry out any initiation tasks
     */
    public void onLoad() {
    }

    /**
     * How well the plugin thinks it can handle a new piece of information
     *
     * @param info Information to be handled
     *
     * @return 1 In case that the open for the location is succesfully.
     */
    public int canHandle(INewInformation info) {
        logger.debug(
            "PDFExtractor - validate handling of the information from " +
            info.getUri());

        int returnInt = 0;

        //get the location extension
        String extension = UtilExtract.getLocationExtension(info.getUri());
        logger.debug("PDFExtractor extension : " + extension);

        //validate if the extension is supported by the extractor
        if (UtilExtract.isExtesionSupported(extension, getListExtensions())) {
            InputStream in = null;

            try {
                in = UtilExtract.getStream(info.getUri());
                returnInt = 3;
            } catch (RpException e) {
            } finally {
                try {
                    if (in != null) {
                        in.close();
                    }
                } catch (IOException e) {
                }
            }
        }

        return returnInt;
    }

    /**
     * Convert the file information into tuples
     *
     * @param info Information to be converted
     *
     * @throws RpException If an error occur in processing the file
     */
    public void convert(INewInformation info) throws RpException {
        logger.info("PDFExtractor handling location :" + info.getUri() +
            " with level " + info.getLevel());

        //extraction the information from the files only if the level is present (>0)
        if (info.getLevel() >= 0) {
            InputStream in = UtilExtract.getStream(info.getUri());

            ByteArrayOutputStream bout = null;
            Writer writer = null;
            PDDocument document = null;

           
            try {

                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");


             
              //load the document
                document = PDDocument.load(in);

                String author = "";
                String title = "";
                String summary = "";

                //get the additional data
                try {
                    PDDocumentInformation pdfinfo = document.getDocumentInformation();

                    if (!Util.isEmpty(pdfinfo.getAuthor())) {
                        author = pdfinfo.getAuthor();
                    }

                    if (!Util.isEmpty(pdfinfo.getTitle())) {
                        title = pdfinfo.getTitle();
                    }

                    if (!Util.isEmpty(pdfinfo.getSubject())) {
                        summary = pdfinfo.getSubject();
                    }
                } catch (Exception eR) {
                    String message = MessageUtil.getMessage("extractor.pdf.metadatamissing",
                            new Object[] { info.getUri() });
                    logger.info(message);
                }

                //set the buffer
                bout = new ByteArrayOutputStream();
                writer = new OutputStreamWriter(bout);

                //strip the document to the buffer
                stripper.writeText(document, writer);
                bout.flush();
                writer.flush();

                //construct the patterns (to not ignore and replace)
                Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
                Pattern replacePattern = Pattern.compile(getReplaceChars());

                NodeStruct node = new NodeStruct();
                ByteArrayInputStream bin = null;

                try {
                    bin = new ByteArrayInputStream(bout.toByteArray());

                    byte[] buffer = new byte[1024];
                    int n = bin.read(buffer);

                    while (n > 0) {
                        String chars = new String(buffer, 0, n);

                        //generate the list of the words for the buffer
                        LinkedList listWords = UtilExtract.getValueList(chars,
                                getMinLengthWord(), notIgnorePattern,
                                replacePattern);

                        for (int j = 0; j < listWords.size(); j++)
                            node.addTuple(TupleStruct.KEYWORD_GENERIC,
                                (String) listWords.get(j));

                        n = bin.read(buffer);
                    }

                    logger.debug("Title is " + title + "Path is :" +
                        info.getUri() + "author" + author + " Summary:" +
                        summary);

                    //set the summary field according to the defualt settings
                    if (summary.length() > getMaxLengthSummary()) {
                        summary = summary.substring(0, getMaxLengthSummary());
                    }

                    DocumStruct doc = new DocumStruct();
                    doc.setTitle(title);
                    doc.setPath(info.getUri());
                    doc.setDescription(summary);
                    doc.setContent(node);
                    doc.setCategoryName(info.getCategoryName());
                    doc.setCategoryLocation(info.getCategoryLocation());

                    //set the pdf -author
                    doc.setAuthor(author);

                    //store and reindex document
                    PluginManager.storeAndAddDocument(doc);
                } catch (IOException e) {
                    logger.debug("Exception in reading the document text" +
                        e.getMessage(), e);
                    throw new RpException("extractor.pdf.textdatamissing",
                        new Object[] { info.getUri() });
                } finally {
                    try {
                        if (bin != null) {
                            bin.close();
                        }
                    } catch (Exception e) {
                    }
                }
            } catch (IOException e) {
                logger.debug("Exception in reading the document text" +
                    e.getMessage(), e);
                throw new RpException("app.extract.error",
                    new Object[] { info.getUri() });
            } finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }

                    if (bout != null) {
                        bout.close();
                    }

                    if (document != null) {
                        document.close();
                    }
                } catch (Exception e) {
                }
            }
        } else {
            logger.debug(
                "Current level for the information is already 0-> no futher process for this location");
        }
    }
}
// End of net.fp.rp.search.back.extractor.PdfDataExtractor