Package org.apache.pdfbox.util

Examples of org.apache.pdfbox.util.PDFTextStripper$WordSeparator


      PDFParser parser = PDFBox.read(source);
      parser.parse();

      PDDocument document = parser.getPDDocument();
      PDFTextStripper stripper = new PDFTextStripper();

      String text = stripper.getText(document);

      document.close();

      return new DocumentOutput(text);
    } catch (IOException e) {
View Full Code Here


           decryptor.decryptDocument("");
       }
       file = File.createTempFile("extract_pdf", ".tmp");
       indexInfo.addDeleteFile(file);
       output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
      /*logger.debug("PDF extraction completed");
       BufferedReader reader;
       try {
         reader = new BufferedReader(new FileReader(file));
        String line = null;
View Full Code Here

     */
    public String parsePdf(File pdfFile) throws IOException {
        Logger.getLogger(PdfFileParser.class.getName()).log(Level.FINEST, "Starting text extraction... Loading document.");
        PDDocument pdfDocument = PDDocument.load(pdfFile);
        Logger.getLogger(PdfFileParser.class.getName()).log(Level.FINEST, "Document loaded... Extracting text.");
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        String text=pdfTextStripper.getText(pdfDocument);       
        Logger.getLogger(PdfFileParser.class.getName()).log(Level.FINEST, "Text extracted... Returning.");
        pdfDocument.close();
        return text;
    }
View Full Code Here

        if(f!=null) {
            Logger.getLogger(PdfFolderParser.class.getName()).log(Level.FINEST, owningThread+" - Loading pdf document "+f.getName());
            PDDocument pdfDoc = PDDocument.load(f);
            Logger.getLogger(PdfFolderParser.class.getName()).log(Level.FINEST, owningThread+" - Ripping pdf");
            PDFTextStripper pdfTextStripper = new PDFTextStripper();
            String text = pdfTextStripper.getText(pdfDoc);
            pdfDoc.close();
            String[] lines = text.split("\n");
            String line = lines[rule.getLine()];
            if (!line.contains(rule.getPattern())) {
                Logger.getLogger(PdfFolderParser.class.getName()).log(Level.FINEST, owningThread+" - Pattern "+rule.getPattern()+" not found in line "+rule.getLine()+" exiting");
View Full Code Here

       String docText = "";
       PDDocument pdDoc = null;
       try {
           cosDoc = parseDocument(is);
           pdDoc = new PDDocument(cosDoc);
           PDFTextStripper stripper = new PDFTextStripper();
           docText = stripper.getText(pdDoc);
           log.debug("PDF Doc Text "+docText.length());
       }
       finally {
            if( pdDoc == null ) {
                log.error("PdDocument is null");
View Full Code Here

     * @throws Exception
     */
     public static String extractContentPdfDocument(final PDDocument pdfDoc) throws Exception {
         String docText = null;
         try {
             PDFTextStripper stripper = new PDFTextStripper();
             docText = stripper.getText(pdfDoc);
             log.debug("Extract content pdf document leng ----> "+ docText.length());
         }
         finally {
              if( docText == null ) {
                  log.error("****************   PDF content is null   *********************");
 
View Full Code Here

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }

                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );
                stripper.writeText( document, output );
            }
            finally
            {
                if( output != null )
                {
View Full Code Here

        if (textPerPage == null) {
            textPerPage = new HashMap<Integer, String>();
        }

        PDFTextStripper strip = null;

        try {
            strip = new PDFTextStripper();
            // Adobe-API uses zero-based page index. PDFTextStripper uses one-based.
            strip.setStartPage(page + 1);
            strip.setEndPage(page + 1);
            textPerPage.put(page, strip.getText(document).trim());
        } catch (IOException e) {
            log.warn("Problem while extracting text from PDF.", e);
        }
    }
View Full Code Here

      PDFParser parser = new PDFParser(source);
      parser.parse();

      PDDocument document = parser.getPDDocument();
      PDFTextStripper stripper = new PDFTextStripper();

      String text = stripper.getText(document);

      document.close();

      return text;
    } catch (IOException e) {
View Full Code Here

                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }

                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try
View Full Code Here

TOP

Related Classes of org.apache.pdfbox.util.PDFTextStripper$WordSeparator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.