/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package pdfdb.parsing;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.Hashtable;
import java.util.List;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.common.PDRectangle;
import org.pdfbox.util.PDFTextStripperByArea;
/** The sliding parser: takes a page-wide rectangle and moves it down the
 * first page of a document one window at a time, extracting the text that
 * falls inside the rectangle until something usable as a title is found.
 * The text of a second, related rectangle is kept as the summary estimate.
 * @author ug22cmg */
class SlidingAreaParser extends PDFTextStripperByArea implements PDFParser
{
    /** Region name and table key used for the title estimate. */
    private static final String TITLE_KEY = "TITLE";
    /** Region name and table key used for the summary estimate. */
    private static final String SUMMARY_KEY = "SUMMARY";
    /** Height of the sliding window, and the distance it moves per slide. */
    private static final int WINDOW_HEIGHT = 500;
    /** Extracted title text shorter than this is treated as "no title yet". */
    private static final int MIN_TITLE_LENGTH = 3;

    private final int x = 0;
    private final int y = 0;
    private double width = 0;
    private double height = 0;
    private String text = null;
    private final Hashtable<String, String> table = new Hashtable<String, String>();
    private PDPage page = null;

    /** Instantiates a new SlidingAreaParser.
     * @param doc The document to parse.
     * @param doneTitle Whether a title has been previously computed
     * (currently not used by this parser).
     * @param doneBody Whether the body text has been previously computed
     * (currently not used by this parser).
     * @throws java.io.IOException If an error occurs reading the pdf.
     * @throws pdfdb.parsing.ParserException If the document has no pages,
     * no media box, or the media box has a non-valid height or width. */
    public SlidingAreaParser(PDDocument doc, boolean doneTitle, boolean doneBody)
            throws IOException, ParserException
    {
        super();
        init(doc);
    }

    /** Initializes the parser with a document: remembers the first page and
     * the dimensions of its media box.
     * @param doc The document to initialize.
     * @throws pdfdb.parsing.ParserException If the document has no pages,
     * no media box exists or the media box has an invalid height or width. */
    private void init(PDDocument doc) throws ParserException
    {
        List<?> pages = doc.getDocumentCatalog().getAllPages();
        // An empty list must be rejected here, otherwise get(0) below would
        // fail with an IndexOutOfBoundsException instead of a ParserException.
        if (pages == null || pages.isEmpty())
        {
            throw new ParserException();
        }
        this.page = (PDPage) pages.get(0);
        PDRectangle mediaBox = page.getMediaBox();
        if (mediaBox == null)
        {
            throw new ParserException();
        }
        this.width = mediaBox.getWidth();
        this.height = mediaBox.getHeight();
        // A window over a page without a positive area cannot contain text.
        if (this.width <= 0 || this.height <= 0)
        {
            throw new ParserException();
        }
    }

    /** Gets the estimate at body text: the full document text with every
     * occurrence of the title estimate removed.
     * @return The body text, or null if {@link #parse} has not produced any
     * text yet. */
    @Override
    public String getEstimateAtBodyText()
    {
        String title = getEstimateAtTitle();
        // Without a title there is nothing to strip; without text there is
        // nothing to strip it from.
        if (text == null || title == null)
        {
            return text;
        }
        return text.replace(title, "");
    }

    /** Gets the estimate of the summary text, with every occurrence of the
     * title estimate removed from it.
     * @return Summary text or null. */
    @Override
    public String getEstimateAtSummary()
    {
        String summary = table.get(SUMMARY_KEY);
        String title = getEstimateAtTitle();
        if (summary != null && title != null)
        {
            summary = summary.replace(title, "");
        }
        return summary;
    }

    /** Gets an estimate at the title text.
     * @return Title text or null. */
    @Override
    public String getEstimateAtTitle()
    {
        return table.get(TITLE_KEY);
    }

    /** Not provided.
     * @return Always null. */
    @Override
    public String getEstimateAtSubTitle()
    {
        return null;
    }

    /** Tells whether a title estimate of usable length has been stored.
     * @return True if a title of at least MIN_TITLE_LENGTH characters is
     * present in the table. */
    private boolean hasUsableTitle()
    {
        String title = table.get(TITLE_KEY);
        return title != null && title.length() >= MIN_TITLE_LENGTH;
    }

    /** Extracts the text of a single region of the first page and stores it
     * in the table under the region's name.
     * @param key The region name, also used as the table key.
     * @param area The area of the page to extract.
     * @throws java.io.IOException If an error occurs while reading. */
    private void extractRegion(String key, Rectangle2D area) throws IOException
    {
        super.getRegions().clear();
        super.addRegion(key, area);
        super.extractRegions(page);
        table.put(key, super.getTextForRegion(key));
    }

    /** Performs the sliding parse, v. slow and intense on system resources.
     * The title window starts at the top of the page and is moved down by
     * one window height at a time until it yields a usable title or has left
     * the page. While the title window is in its first position the summary
     * window is the window directly below it; once the title window has
     * moved, the summary window starts at the same position and is twice as
     * high. This is best effort: failures are reported on System.err and
     * leave the title and/or summary estimates unset.
     * @param doc The document to parse (unused: the page remembered at
     * construction time is the one that is scanned).
     * @throws java.io.IOException Declared for callers; extraction errors
     * are currently reported rather than propagated. */
    private void slidingParse(PDDocument doc) throws IOException
    {
        // Distance the title window has been moved from its start position.
        // Kept separate from the window height so that no part of the page
        // is skipped between two consecutive windows.
        double offset = 0;
        Rectangle2D titleArea =
                new Rectangle2D.Double(x, y, width, WINDOW_HEIGHT);
        Rectangle2D summaryArea =
                new Rectangle2D.Double(x, y + WINDOW_HEIGHT, width, WINDOW_HEIGHT);
        try
        {
            while (!hasUsableTitle() && titleArea.getY() < height)
            {
                extractRegion(TITLE_KEY, titleArea);
                if (!hasUsableTitle())
                {
                    offset += WINDOW_HEIGHT;
                    titleArea = new Rectangle2D.Double(
                            x, y + offset, width, WINDOW_HEIGHT);
                    summaryArea = new Rectangle2D.Double(
                            x, y + offset, width, 2 * WINDOW_HEIGHT);
                }
            }
            extractRegion(SUMMARY_KEY, summaryArea);
        }
        catch (final OutOfMemoryError e)// we can actually do this
        {
            System.err.println("Out of memory");
            System.exit(1);
        }
        catch (Exception e)
        {
            // Best effort: keep whatever was extracted so far, but do not
            // hide the failure completely.
            System.err.println("Sliding parse failed: " + e);
        }
        finally
        {
            System.gc(); // Absolute must!
        }
    }

    /** Performs the full parse operation. Notice: equates to multiple
     * full text document extractions.
     * @param doc The document to extract text from.
     * @return The full document text.
     * @throws java.io.IOException If an error occurs while reading. */
    @Override
    public String parse(PDDocument doc) throws IOException
    {
        this.text = getText(doc);
        slidingParse(doc);
        return text;
    }
}