Package pdfdb.parsing

Source Code of pdfdb.parsing.SlidingAreaParser

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package pdfdb.parsing;

import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.Hashtable;
import java.util.List;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.common.PDRectangle;
import org.pdfbox.util.PDFTextStripperByArea;

/** The sliding parser, takes a rectangle and slowly moves the rectangle
* down, extracting text within the rectangle shape.
* @author ug22cmg */
class SlidingAreaParser extends PDFTextStripperByArea implements PDFParser
{

    private int x = 0;
    private int y = 0;
    private double width = 0;
    private double height = 0;
    private String text = null;
    private Hashtable<String, String> table = new Hashtable<String, String>();
    private PDPage page = null;

    /** Instantiates a new SlidingAreaParser.
     * @param doc The document to parse.
     * @param doneTitle Whether a title has been previously computed.
     * @param doneBody Whether the body text has been previously computed.
     * @throws java.io.IOException If an error occurs reading the pdf.
     * @throws pdfdb.parsing.ParserException If the document has no media box
     * or the media box has a non-valid height or width. */
    public SlidingAreaParser(PDDocument doc, boolean doneTitle, boolean doneBody)
            throws IOException, ParserException
    {
        super();
        init(doc);
    }

    /** Initializes the parser with a document.
     * @param doc The document to initialize.
     * @throws pdfdb.parsing.ParserException If no media box exists or the
     * media box has an invalid height or width. */
    private void init(PDDocument doc) throws ParserException
    {
        List pages = doc.getDocumentCatalog().getAllPages();
        if (pages != null)
        {
            PDRectangle mediaBox = null;
            this.page = (PDPage) pages.get(0);
            mediaBox = page.getMediaBox();
            if (mediaBox != null)
            {
                this.width = mediaBox.getWidth();
                this.height = mediaBox.getHeight();
            }
            else
            {
                throw new ParserException();
            }
        }
        else
        {
            throw new ParserException();
        }
    }

    /** Gets the estimate at body text.
     * @return The body text or null. */
    @Override
    public String getEstimateAtBodyText()
    {
        return text.replace(getEstimateAtTitle(), "");
    }

    /** Gets the estimate of the summary text.
     * @return Summary text or null. */
    @Override
    public String getEstimateAtSummary()
    {
        String summary = table.get("SUMMARY");
        String title = getEstimateAtTitle();
        if (summary != null && title != null)
            summary = summary.replace(title, "");
        return summary;
    }

    /** Gets an estimate at the title text.
     * @return Title text or null. */
    @Override
    public String getEstimateAtTitle()
    {
        return table.get("TITLE");
    }

    /** Not provided.
     * @return Always null. */
    @Override
    public String getEstimateAtSubTitle()
    {
        return null;
    }

    /** Performs the sliding parse, v. slow and intense on system resources.
     * @param doc The document to parse.
     * @throws java.io.IOException If an error occurs while reading the
     * document. */
    private void slidingParse(PDDocument doc) throws IOException
    {
        int h = 500;
        Rectangle2D rect = new Rectangle2D.Double(x, y, width, h);
        Rectangle2D rect2 = new Rectangle2D.Double(x, h, width, h);
        boolean firstTime = true;
        try
        {
            while ((table.get("TITLE") == null || table.get("TITLE").length() < 3) && rect.
                    getY() < height)
            {
                if (!firstTime)
                {
                    h += 500;
                    rect = new Rectangle2D.Double(x, y + h, width, 500);
                    rect2 = new Rectangle2D.Double(x, y + h, width, 1000);
                }

                super.getRegions().clear();
                super.addRegion("TITLE", rect);
                super.extractRegions(page);
                table.put("TITLE", super.getTextForRegion("TITLE"));
                firstTime = false;
            }
            super.getRegions().clear();
            super.addRegion("SUMMARY", rect2);
            super.extractRegions(page);
            table.put("SUMMARY", super.getTextForRegion("SUMMARY"));
        }
        catch (final OutOfMemoryError e)// we can actually do this
        {
            System.err.println("Out of memory");
            System.exit(1);
        }
        catch (Exception e)
        {
        }
        finally
        {
            System.gc(); // Absolute must!
        }
    }

    /** Performs the full parse operation. Notice: equates to multiple
     * full text document extractions.
     * @param doc The document to extract text from.
     * @return The full document text.
     * @throws java.io.IOException If an error occurs while reading. */
    @Override
    public String parse(PDDocument doc) throws IOException
    {
        this.text = getText(doc);
        slidingParse(doc);
        return text;
    }
}
TOP

Related Classes of pdfdb.parsing.SlidingAreaParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.