Examples of org.apache.pdfbox.util.PDFTextStripper

org.apache.pdfbox.util.PDFTextStripper
This class will take a PDF document and strip out all of the text, ignoring the formatting and such. Please note: it is up to clients of this class to verify that a specific user has the correct permissions to extract text from the PDF document. The basic flow of this process is that we get a document and use a series of processXXX() functions that work on smaller and smaller chunks of the page. Eventually, we fully process each page and then print it. @author Ben Litchfield

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }


                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setSortByPosition( sort );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );
                stripper.writeText( document, output );
            }
            finally
            {
                if( output != null )
                {

View Full Code Here


            //create a writer where to append the text content.
            StringWriter writer = new StringWriter();
            if( stripper == null )
            {
                stripper = new PDFTextStripper();
            }
            else
            {
                stripper.resetEngine();
            }

View Full Code Here

                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }


                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try

View Full Code Here

         catch (IOException e)
         {
            throw new DocumentReadException("Can not load PDF document.", e);
         }


         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try

View Full Code Here

                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }


                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try

View Full Code Here

            docTitle = MultiProtocolURI.unescape(location.getFileName());
        }
        final CharBuffer writer = new CharBuffer();
        try {
            // create a writer for output
            final PDFTextStripper  stripper = new PDFTextStripper();
            // we start the pdf parsing in a separate thread to ensure that it can be terminated
            final Thread t = new Thread() {
                public void run() {
                    try {
                        stripper.writeText(pdfDoc, writer); // may throw a NPE
                    } catch (final Throwable e) {}
                }
            };
            t.start();
            t.join(3000);

View Full Code Here

        {
            boolean useTemporaryFile = ConfigurationManager.getBooleanProperty("pdffilter.largepdfs", false);


            // get input stream from bitstream
            // pass to filter, get string back
            PDFTextStripper pts = new PDFTextStripper();
            PDDocument pdfDoc = null;
            Writer writer = null;
            File tempTextFile = null;
            ByteArrayOutputStream byteStream = null;


            if (useTemporaryFile)
            {
                tempTextFile = File.createTempFile("dspacepdfextract" + source.hashCode(), ".txt");
                tempTextFile.deleteOnExit();
                writer = new OutputStreamWriter(new FileOutputStream(tempTextFile));
            }
            else
            {
                byteStream = new ByteArrayOutputStream();
                writer = new OutputStreamWriter(byteStream);
            }
            
            try
            {
                pdfDoc = PDDocument.load(source);
                pts.writeText(pdfDoc, writer);
            }
            finally
            {
                try
                {

View Full Code Here

      e.printStackTrace();
    }
  }


  private void temp() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();
    String text = stripper.getText(doc);
    log.info(text);
  }

View Full Code Here

  public static List<String> readLines(File file) {
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;


    try {
      PDFTextStripper pdfStripper = new PDFTextStripper();
      pdfStripper.setStartPage(1);
      pdfStripper.setEndPage(1);


      FileInputStream input = new FileInputStream(file);


      PDFParser parser = new PDFParser(input);
      parser.parse();
      cosDoc = parser.getDocument();
      pdDoc = new PDDocument(cosDoc);


      String text = pdfStripper.getText(pdDoc);
      text = text.replaceAll("\t", " ");
      Iterable<String> lines = Splitter.on("\n").split(text);
      return ImmutableList.copyOf(lines);
    } catch (IOException e) {
      return ImmutableList.of();

View Full Code Here


      PDFParser parser = new PDFParser(source);
      parser.parse();


      PDDocument document = parser.getPDDocument();
      PDFTextStripper stripper = new PDFTextStripper();


      String text = stripper.getText(document);


      document.close();


      return text;
    } catch (IOException e) {

View Full Code Here

0 1 2 3 4

TOP

Related Classes of org.apache.pdfbox.util.PDFTextStripper

br.com.objectos.way.reports.htmltopdf.Pdfs

br.net.woodstock.rockframework.document.pdf.pdfbox.GetTextProcessor

br.net.woodstock.rockframework.office.pdf.impl.PDFBoxManager

com.cardence.lawshelf.pdf.PdfBoxParserImpl

com.gentics.cr.lucene.indexer.transformer.pdf.PDFContentTransformer

com.google.code.ftspc.lector.parsers.PDF.PDFParserLocal

com.stimulus.archiva.extraction.PDFExtractor

de.pdf_scrutinizer.document.DocumentAdapter

geopms.GeoPMSImportPDF

ir.Indexer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.