// Source: net.fp.rp.search.back.extractor.PdfDataExtractor

/*
* Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
* built with the help of Fast-Soft (fastsoftdev@yahoo.com)
*
* released under terms of the GPL license
* http://www.opensource.org/licenses/gpl-license.php
*
* This product includes software developed by the
* Apache Software Foundation (http://www.apache.org)."
*
* This product includes software developed by the
* Spring Framework Project (http://www.springframework.org)."
*
*/
package net.fp.rp.search.back.extractor;

import net.fp.rp.common.exception.RpException;
import net.fp.rp.search.back.extractor.util.UtilExtract;
import net.fp.rp.search.back.struct.DocumStruct;
import net.fp.rp.search.back.struct.NodeStruct;
import net.fp.rp.search.back.struct.TupleStruct;
import net.fp.rp.search.common.util.MessageUtil;
import net.fp.rp.search.common.util.Util;
import net.fp.rp.search.mid.global.PluginManager;
import net.fp.rp.search.plugins.INewInformation;

import org.apache.log4j.Logger;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;

import org.pdfbox.util.PDFTextStripper;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

import java.util.LinkedList;
import java.util.regex.Pattern;


/**
* Extracts information from pdf files in a format that can be stored and added
* to the Index.
*
* @author brownpa
* Copyright @link www.firstpartners.net/red
*/
public class PdfDataExtractor extends GenericDataExtractor {
    /** Logger for this class and subclasses */
    protected final Logger logger = Logger.getLogger(getClass());

    /**
     * The original place where we got this data
     *
     * @return pointer
     */
    public String getOriginalUri() {
        return "";
    }

    /**
     * Carry out any initiation tasks
     */
    public void onLoad() {
    }

    /**
     * How well the plugin thinks it can handle a new piece of information
     *
     * @param info Information to be handled
     *
     * @return 1 In case that the open for the location is succesfully.
     */
    public int canHandle(INewInformation info) {
        logger.debug(
            "PDFExtractor - validate handling of the information from " +
            info.getUri());

        int returnInt = 0;

        //get the location extension
        String extension = UtilExtract.getLocationExtension(info.getUri());
        logger.debug("PDFExtractor extension : " + extension);

        //validate if the extension is supported by the extractor
        if (UtilExtract.isExtesionSupported(extension, getListExtensions())) {
            InputStream in = null;

            try {
                in = UtilExtract.getStream(info.getUri());
                returnInt = 3;
            } catch (RpException e) {
            } finally {
                try {
                    if (in != null) {
                        in.close();
                    }
                } catch (IOException e) {
                }
            }
        }

        return returnInt;
    }

    /**
     * Convert the file information into tuples
     *
     * @param info Information to be converted
     *
     * @throws RpException If an error occur in processing the file
     */
    public void convert(INewInformation info) throws RpException {
        logger.info("PDFExtractor handling location :" + info.getUri() +
            " with level " + info.getLevel());

        //extraction the information from the files only if the level is present (>0)
        if (info.getLevel() >= 0) {
            InputStream in = UtilExtract.getStream(info.getUri());

            ByteArrayOutputStream bout = null;
            Writer writer = null;
            PDDocument document = null;

           
            try {

                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");


             
              //load the document
                document = PDDocument.load(in);

                String author = "";
                String title = "";
                String summary = "";

                //get the additional data
                try {
                    PDDocumentInformation pdfinfo = document.getDocumentInformation();

                    if (!Util.isEmpty(pdfinfo.getAuthor())) {
                        author = pdfinfo.getAuthor();
                    }

                    if (!Util.isEmpty(pdfinfo.getTitle())) {
                        title = pdfinfo.getTitle();
                    }

                    if (!Util.isEmpty(pdfinfo.getSubject())) {
                        summary = pdfinfo.getSubject();
                    }
                } catch (Exception eR) {
                    String message = MessageUtil.getMessage("extractor.pdf.metadatamissing",
                            new Object[] { info.getUri() });
                    logger.info(message);
                }

                //set the buffer
                bout = new ByteArrayOutputStream();
                writer = new OutputStreamWriter(bout);

                //strip the document to the buffer
                stripper.writeText(document, writer);
                bout.flush();
                writer.flush();

                //construct the patterns (to not ignore and replace)
                Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
                Pattern replacePattern = Pattern.compile(getReplaceChars());

                NodeStruct node = new NodeStruct();
                ByteArrayInputStream bin = null;

                try {
                    bin = new ByteArrayInputStream(bout.toByteArray());

                    byte[] buffer = new byte[1024];
                    int n = bin.read(buffer);

                    while (n > 0) {
                        String chars = new String(buffer, 0, n);

                        //generate the list of the words for the buffer
                        LinkedList listWords = UtilExtract.getValueList(chars,
                                getMinLengthWord(), notIgnorePattern,
                                replacePattern);

                        for (int j = 0; j < listWords.size(); j++)
                            node.addTuple(TupleStruct.KEYWORD_GENERIC,
                                (String) listWords.get(j));

                        n = bin.read(buffer);
                    }

                    logger.debug("Title is " + title + "Path is :" +
                        info.getUri() + "author" + author + " Summary:" +
                        summary);

                    //set the summary field according to the defualt settings
                    if (summary.length() > getMaxLengthSummary()) {
                        summary = summary.substring(0, getMaxLengthSummary());
                    }

                    DocumStruct doc = new DocumStruct();
                    doc.setTitle(title);
                    doc.setPath(info.getUri());
                    doc.setDescription(summary);
                    doc.setContent(node);
                    doc.setCategoryName(info.getCategoryName());
                    doc.setCategoryLocation(info.getCategoryLocation());

                    //set the pdf -author
                    doc.setAuthor(author);

                    //store and reindex document
                    PluginManager.storeAndAddDocument(doc);
                } catch (IOException e) {
                    logger.debug("Exception in reading the document text" +
                        e.getMessage(), e);
                    throw new RpException("extractor.pdf.textdatamissing",
                        new Object[] { info.getUri() });
                } finally {
                    try {
                        if (bin != null) {
                            bin.close();
                        }
                    } catch (Exception e) {
                    }
                }
            } catch (IOException e) {
                logger.debug("Exception in reading the document text" +
                    e.getMessage(), e);
                throw new RpException("app.extract.error",
                    new Object[] { info.getUri() });
            } finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }

                    if (bout != null) {
                        bout.close();
                    }

                    if (document != null) {
                        document.close();
                    }
                } catch (Exception e) {
                }
            }
        } else {
            logger.debug(
                "Current level for the information is already 0-> no futher process for this location");
        }
    }
}
// End of net.fp.rp.search.back.extractor.PdfDataExtractor