/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package pdfrobot.engine.parser;
import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * Extracts the text content of a PDF file using PDFBox.
 *
 * @author hedsttor
 */
public class PdfFileParser {

    /** Shared logger; resolved once instead of on every log call. */
    private static final Logger LOGGER = Logger.getLogger(PdfFileParser.class.getName());

    /**
     * Loads the given PDF file and returns the text produced by a
     * {@link PDFTextStripper} with its default settings.
     *
     * @param pdfFile the PDF file to read
     * @return the text extracted from the document
     * @throws IOException if loading the document, extracting its text, or
     *                     closing the document fails
     */
    public String parsePdf(File pdfFile) throws IOException {
        LOGGER.log(Level.FINEST, "Starting text extraction... Loading document.");
        PDDocument pdfDocument = PDDocument.load(pdfFile);
        try {
            LOGGER.log(Level.FINEST, "Document loaded... Extracting text.");
            PDFTextStripper pdfTextStripper = new PDFTextStripper();
            String text = pdfTextStripper.getText(pdfDocument);
            LOGGER.log(Level.FINEST, "Text extracted... Returning.");
            return text;
        } finally {
            // Close the document even when stripping fails, so a failed
            // extraction does not leave the loaded document open.
            pdfDocument.close();
        }
    }
}