Package org.apache.pdfbox.util

Examples of org.apache.pdfbox.util.PDFTextStripper$WordSeparator


          throw new RegainException("Document is encrypted and can't be opened: " + url);
        }
      }

      // Extract the text with a utility class
      PDFTextStripper stripper = new PDFTextStripper();
      stripper.setSuppressDuplicateOverlappingText(false);
      stripper.setSortByPosition(true);
      stripper.setStartPage(1);
      stripper.setEndPage(Integer.MAX_VALUE);

      setCleanedContent(stripper.getText(pdfDocument).replaceAll("visiblespace", " "));

      // extract annotations
      StringBuilder annotsResult = new StringBuilder();
      List allPages = pdfDocument.getDocumentCatalog().getAllPages();
      for (int i = 0; i < allPages.size(); i++) {
View Full Code Here


            docTitle = MultiProtocolURI.unescape(location.getFileName());
        }
        CharBuffer writer = null;
        try {
            // create a writer for output
            PDFTextStripper stripper = null;
            writer = new CharBuffer();
            stripper = new PDFTextStripper();
            stripper.writeText(pdfDoc, writer); // may throw a NPE
            pdfDoc.close();
            writer.close();
        } catch (final IOException e) {
            // close the writer
            if (writer != null) try { writer.close(); } catch (final Exception ex) {}
View Full Code Here

    public void run() {
        try {
            String fileContent = "";
            File filePDF = new File(pathToFile);
            PDDocument pdDoc = PDDocument.load(new FileInputStream(filePDF));
            PDFTextStripper PDFTextStripper = null;

            Integer numberOfPages = pdDoc.getNumberOfPages();

            for (int page = 0; page < numberOfPages; page++) {
                PDFTextStripper = new PDFTextStripper("UTF-8");
                PDFTextStripper.setStartPage(page);
                PDFTextStripper.setEndPage(page);
                String text = PDFTextStripper.getText(pdDoc);
                fileContent += " " + text.replaceAll("\\s+", " ").trim();
                text = null;
            }

            pdDoc.close();
View Full Code Here

                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }

                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try
View Full Code Here

      }

      //create a writer where to append the text content.
      StringWriter writer = new StringWriter();
      if (stripper == null) {
        stripper = new PDFTextStripper();
      } else {
        stripper.resetEngine();
      }
      stripper.writeText(pdfDocument, writer);
View Full Code Here

                else
                {
                    output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding );
                }

                PDFTextStripper stripper;
                if(toHTML)
                {
                    stripper = new PDFText2HTML();
                }
                else
                {
                    stripper = new PDFTextStripper();
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );

                startTime = startProcessing("Starting text extraction");
                if (debug)
                {
                    System.err.println("Writing to "+outputFile);
                }
               
                // Extract text for main document:
                stripper.writeText( document, output );
               
                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();   
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String,COSObjectable> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null) {
                            for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet())
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file != null && file.getSubtype().equals("application/pdf"))
                                {
                                    if (debug)
                                    {
                                        System.err.println("  is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try
                                    {
                                        subDoc = PDDocument.load(fis);
                                    }
                                    finally
                                    {
                                        fis.close();
                                    }
                                    try
                                    {
                                        stripper.writeText( subDoc, output );
                                    }
                                    finally
                                    {
                                        subDoc.close();
                                    }
View Full Code Here

        contentStream.close();
        return doc;
    }

    public void testEscapeTitle() throws IOException {
        PDFTextStripper stripper = new PDFText2HTML();
        PDDocument doc = createDocument("<script>\u3042", PDType1Font.HELVETICA, "<foo>");
        String text = stripper.getText(doc);
      
        Matcher m = Pattern.compile("<title>(.*?)</title>").matcher(text);
        assertTrue(m.find());
        assertEquals("&lt;script&gt;&#12354;", m.group(1));

View Full Code Here

        assertTrue(text.indexOf("&lt;foo&gt;") >= 0);
    }

    public void testStyle() throws IOException {
        PDFTextStripper stripper = new PDFText2HTML();
        PDDocument doc = createDocument("t", PDType1Font.HELVETICA_BOLD, "<bold>");
        String text = stripper.getText(doc);

        Matcher bodyMatcher = Pattern.compile("<p>(.*?)</p>").matcher(text);
        assertTrue("body p exists", bodyMatcher.find());
        assertEquals("body p", "<b>&lt;bold&gt;</b>", bodyMatcher.group(1));
    }
View Full Code Here

            // create a writer where to append the text content.
            StringWriter writer = new StringWriter();
            if (stripper == null)
            {
                stripper = new PDFTextStripper();
            }
            stripper.writeText(pdfDocument, writer);

            // Note: the buffer to string operation is costless;
            // the char array value of the writer buffer and the content string
View Full Code Here

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }

                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );

                startTime = startProcessing("Starting text extraction");
                if (debug)
                {
                    System.err.println("Writing to "+outputFile);
                }
               
                // Extract text for main document:
                stripper.writeText( document, output );
               
                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();   
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String,COSObjectable> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null) {
                            for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet())
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file.getSubtype().equals("application/pdf"))
                                {
                                    if (debug)
                                    {
                                        System.err.println("  is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try
                                    {
                                        subDoc = PDDocument.load(fis);
                                    }
                                    finally
                                    {
                                        fis.close();
                                    }
                                    try
                                    {
                                        stripper.writeText( subDoc, output );
                                    }
                                    finally
                                    {
                                        subDoc.close();
                                    }
View Full Code Here

TOP

Related Classes of org.apache.pdfbox.util.PDFTextStripper$WordSeparator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.