Package pdfrobot.engine.parser

Source Code of pdfrobot.engine.parser.PdfFileParser

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/

package pdfrobot.engine.parser;

import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
*Extracts text from a PDF file.
* @author hedsttor
*/
public class PdfFileParser {

    /**
     * Extracts text from a PDF file.
     * @param pdfFile
     * @return
     * @throws IOException
     */
    public String parsePdf(File pdfFile) throws IOException {
        Logger.getLogger(PdfFileParser.class.getName()).log(Level.FINEST, "Starting text extraction... Loading document.");
        PDDocument pdfDocument = PDDocument.load(pdfFile);
        Logger.getLogger(PdfFileParser.class.getName()).log(Level.FINEST, "Document loaded... Extracting text.");
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        String text=pdfTextStripper.getText(pdfDocument);       
        Logger.getLogger(PdfFileParser.class.getName()).log(Level.FINEST, "Text extracted... Returning.");
        pdfDocument.close();
        return text;
    }
}
TOP

Related Classes of pdfrobot.engine.parser.PdfFileParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.