Examples of PDFTextStripper

com.dotcms.repackage.org.apache.pdfbox.util.PDFTextStripper
org.apache.pdfbox.util.PDFTextStripper
This class will take a PDF document and strip out all of the text, ignoring the formatting and such. Please note: it is up to clients of this class to verify that a specific user has the correct permissions to extract text from the PDF document. The basic flow of this process is that we get a document and use a series of processXXX() functions that work on smaller and smaller chunks of the page. Eventually, we fully process each page and then print it. @author Ben Litchfield
org.pdfbox.util.PDFTextStripper
This class will take a pdf document and strip out all of the text and ignore the formatting and such. @author Ben Litchfield @version $Revision: 1.69 $

Examples of com.dotcms.repackage.org.apache.pdfbox.util.PDFTextStripper


            PDDocument pdDoc= parser.getPDDocument();


        StringWriter stringWriter = new StringWriter();


        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
        stripper.writeText(pdDoc, stringWriter);


        text = stringWriter.toString();


        stringWriter.close();
        pdDoc.close();
      }
      catch (Exception e) {
        _log.error(e.getMessage());
      }
    }
    else if (fileExt.equals(".rtf")) {
      try {
        DefaultStyledDocument dsd = new DefaultStyledDocument();


        RTFEditorKit rtfEditorKit = new RTFEditorKit();
        rtfEditorKit.read(reader, dsd, 0);


        text = dsd.getText(0, dsd.getLength());
      }
      catch (Exception e) {
        _log.error(e.getMessage());
      }
    }
    else if (fileExt.equals(".xls")) {
      try {
        XLSTextStripper stripper = new XLSTextStripper(fis);


        text = stripper.getText();
      }
      catch (Exception e) {
        _log.error(e.getMessage());
      }
    }

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

          throw new RegainException("Document is encrypted and can't be opened: " + url);
        }
      }


      // Extract the text with a utility class
      PDFTextStripper stripper = new PDFTextStripper();
      stripper.setSuppressDuplicateOverlappingText(false);
      stripper.setSortByPosition(true);
      stripper.setStartPage(1);
      stripper.setEndPage(Integer.MAX_VALUE);


      setCleanedContent(stripper.getText(pdfDocument).replaceAll("visiblespace", " "));


      // extract annotations
      StringBuilder annotsResult = new StringBuilder();
      List allPages = pdfDocument.getDocumentCatalog().getAllPages();
      for (int i = 0; i < allPages.size(); i++) {

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

            docTitle = MultiProtocolURI.unescape(location.getFileName());
        }
        CharBuffer writer = null;
        try {
            // create a writer for output
            PDFTextStripper stripper = null;
            writer = new CharBuffer();
            stripper = new PDFTextStripper();
            stripper.writeText(pdfDoc, writer); // may throw a NPE
            pdfDoc.close();
            writer.close();
        } catch (final IOException e) {
            // close the writer
            if (writer != null) try { writer.close(); } catch (final Exception ex) {}

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

    public void run() {
        try {
            String fileContent = "";
            File filePDF = new File(pathToFile);
            PDDocument pdDoc = PDDocument.load(new FileInputStream(filePDF));
            PDFTextStripper PDFTextStripper = null;


            Integer numberOfPages = pdDoc.getNumberOfPages();


            for (int page = 0; page < numberOfPages; page++) {
                PDFTextStripper = new PDFTextStripper("UTF-8");
                PDFTextStripper.setStartPage(page);
                PDFTextStripper.setEndPage(page);
                String text = PDFTextStripper.getText(pdDoc);
                fileContent += " " + text.replaceAll("\\s+", " ").trim();
                text = null;
            }


            pdDoc.close();

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }


                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

      }


      //create a writer where to append the text content.
      StringWriter writer = new StringWriter();
      if (stripper == null) {
        stripper = new PDFTextStripper();
      } else {
        stripper.resetEngine();
      }
      stripper.writeText(pdfDocument, writer);

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

                else
                {
                    output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding );
                }


                PDFTextStripper stripper;
                if(toHTML)
                {
                    stripper = new PDFText2HTML();
                }
                else
                {
                    stripper = new PDFTextStripper();
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );


                startTime = startProcessing("Starting text extraction");
                if (debug) 
                {
                    System.err.println("Writing to "+outputFile);
                }
                
                // Extract text for main document:
                stripper.writeText( document, output );
                
                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();    
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String,COSObjectable> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null) {
                            for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) 
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file != null && file.getSubtype().equals("application/pdf"))
                                {
                                    if (debug)
                                    {
                                        System.err.println("  is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try 
                                    {
                                        subDoc = PDDocument.load(fis);
                                    } 
                                    finally 
                                    {
                                        fis.close();
                                    }
                                    try 
                                    {
                                        stripper.writeText( subDoc, output );
                                    } 
                                    finally 
                                    {
                                        subDoc.close();
                                    }

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

        contentStream.close();
        return doc;
    }


    public void testEscapeTitle() throws IOException {
        PDFTextStripper stripper = new PDFText2HTML();
        PDDocument doc = createDocument("<script>\u3042", PDType1Font.HELVETICA, "<foo>");
        String text = stripper.getText(doc);
       
        Matcher m = Pattern.compile("<title>(.*?)</title>").matcher(text);
        assertTrue(m.find());
        assertEquals("&lt;script&gt;&#12354;", m.group(1));

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper


        assertTrue(text.indexOf("&lt;foo&gt;") >= 0);
    }


    public void testStyle() throws IOException {
        PDFTextStripper stripper = new PDFText2HTML();
        PDDocument doc = createDocument("t", PDType1Font.HELVETICA_BOLD, "<bold>");
        String text = stripper.getText(doc);


        Matcher bodyMatcher = Pattern.compile("<p>(.*?)</p>").matcher(text);
        assertTrue("body p exists", bodyMatcher.find());
        assertEquals("body p", "<b>&lt;bold&gt;</b>", bodyMatcher.group(1));
    }

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper


            // create a writer where to append the text content.
            StringWriter writer = new StringWriter();
            if (stripper == null)
            {
                stripper = new PDFTextStripper();
            }
            stripper.writeText(pdfDocument, writer);


            // Note: the buffer to string operation is costless;
            // the char array value of the writer buffer and the content string

View Full Code Here

0 1 2 3 4 5

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.