Examples of org.apache.pdfbox.util.PDFTextStripper

org.apache.pdfbox.util.PDFTextStripper
This class takes a PDF document and strips out all of the text, ignoring the formatting and such. Please note: it is up to clients of this class to verify that a specific user has the correct permissions to extract text from the PDF document. The basic flow of this process is that we get a document and use a series of processXXX() functions that work on smaller and smaller chunks of the page. Eventually, we fully process each page and then print it. @author Ben Litchfield


      PDFParser parser = PDFBox.read(source);
      parser.parse();


      PDDocument document = parser.getPDDocument();
      PDFTextStripper stripper = new PDFTextStripper();


      String text = stripper.getText(document);


      document.close();


      return new DocumentOutput(text);
    } catch (IOException e) {

View Full Code Here

           decryptor.decryptDocument("");
       }
       file = File.createTempFile("extract_pdf", ".tmp");
       indexInfo.addDeleteFile(file);
       output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
      /*logger.debug("PDF extraction completed");
       BufferedReader reader;
       try {
         reader = new BufferedReader(new FileReader(file));
        String line = null;

View Full Code Here

     */
    public String parsePdf(File pdfFile) throws IOException {
        Logger.getLogger(PdfFileParser.class.getName()).log(Level.FINEST, "Starting text extraction... Loading document.");
        PDDocument pdfDocument = PDDocument.load(pdfFile);
        Logger.getLogger(PdfFileParser.class.getName()).log(Level.FINEST, "Document loaded... Extracting text.");
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        String text=pdfTextStripper.getText(pdfDocument);        
        Logger.getLogger(PdfFileParser.class.getName()).log(Level.FINEST, "Text extracted... Returning.");
        pdfDocument.close();
        return text;
    }

View Full Code Here


        if(f!=null) {
            Logger.getLogger(PdfFolderParser.class.getName()).log(Level.FINEST, owningThread+" - Loading pdf document "+f.getName());
            PDDocument pdfDoc = PDDocument.load(f);
            Logger.getLogger(PdfFolderParser.class.getName()).log(Level.FINEST, owningThread+" - Ripping pdf");
            PDFTextStripper pdfTextStripper = new PDFTextStripper();
            String text = pdfTextStripper.getText(pdfDoc);
            pdfDoc.close();
            String[] lines = text.split("\n");
            String line = lines[rule.getLine()];
            if (!line.contains(rule.getPattern())) {
                Logger.getLogger(PdfFolderParser.class.getName()).log(Level.FINEST, owningThread+" - Pattern "+rule.getPattern()+" not found in line "+rule.getLine()+" exiting");

View Full Code Here

       String docText = "";
       PDDocument pdDoc = null;
       try {
           cosDoc = parseDocument(is);
           pdDoc = new PDDocument(cosDoc);
           PDFTextStripper stripper = new PDFTextStripper();
           docText = stripper.getText(pdDoc);
           log.debug("PDF Doc Text "+docText.length());
       }
       finally {
            if( pdDoc == null ) {
                log.error("PdDocument is null");

View Full Code Here

     * @throws Exception
     */
     public static String extractContentPdfDocument(final PDDocument pdfDoc) throws Exception {
         String docText = null;
         try {
             PDFTextStripper stripper = new PDFTextStripper();
             docText = stripper.getText(pdfDoc);
             log.debug("Extract content pdf document leng ----> "+ docText.length());
         }
         finally {
              if( docText == null ) {
                  log.error("****************   PDF content is null   *********************");

View Full Code Here

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }


                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );
                stripper.writeText( document, output );
            }
            finally
            {
                if( output != null )
                {

View Full Code Here


        if (textPerPage == null) {
            textPerPage = new HashMap<Integer, String>();
        }


        PDFTextStripper strip = null;


        try {
            strip = new PDFTextStripper();
            // Adobe-API uses zero-based page index. PDFTextStripper uses one-based.
            strip.setStartPage(page + 1);
            strip.setEndPage(page + 1);
            textPerPage.put(page, strip.getText(document).trim());
        } catch (IOException e) {
            log.warn("Problem while extracting text from PDF.", e);
        }
    }

View Full Code Here


      PDFParser parser = new PDFParser(source);
      parser.parse();


      PDDocument document = parser.getPDDocument();
      PDFTextStripper stripper = new PDFTextStripper();


      String text = stripper.getText(document);


      document.close();


      return text;
    } catch (IOException e) {

View Full Code Here

                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }


                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try

View Full Code Here

0 1 2 3 4

TOP

Related Classes of org.apache.pdfbox.util.PDFTextStripper

br.com.objectos.way.reports.htmltopdf.Pdfs

br.net.woodstock.rockframework.document.pdf.pdfbox.GetTextProcessor

br.net.woodstock.rockframework.office.pdf.impl.PDFBoxManager

com.cardence.lawshelf.pdf.PdfBoxParserImpl

com.gentics.cr.lucene.indexer.transformer.pdf.PDFContentTransformer

com.google.code.ftspc.lector.parsers.PDF.PDFParserLocal

com.stimulus.archiva.extraction.PDFExtractor

de.pdf_scrutinizer.document.DocumentAdapter

geopms.GeoPMSImportPDF

ir.Indexer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.