Examples of org.pdfbox.util.PDFTextStripper

org.pdfbox.util.PDFTextStripper
This class takes a PDF document and strips out all of its text, ignoring the formatting and similar details. @author Ben Litchfield @version $Revision: 1.69 $

     */
    public void doTestFile(File file, boolean bLogResult)
        throws Exception
    {


        PDFTextStripper stripper = new PDFTextStripper();
        OutputStream os = null;
        Writer writer = null;
        PDDocument document = null;
        try
        {
            document = PDDocument.load(file);


            File outFile = new File(file.getParentFile().getParentFile(), "output/" + file.getName() + ".txt");
            os = new FileOutputStream(outFile);
            writer = new OutputStreamWriter(os);


            stripper.writeText(document, writer);
        }
        finally
        {
            if( writer != null )
            {

View Full Code Here

                        output = new OutputStreamWriter(
                            new FileOutputStream( textFile ) );
                    }
                }
    
                PDFTextStripper stripper = null;
                if(toHTML) 
                {
                   stripper = new PDFText2HTML();
                } 
                else 
                {
                   stripper = new PDFTextStripper();
                }
                stripper.setSortByPosition( sort );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );
                stripper.writeText( document, output );
            }
            finally
            {
                if( output != null )
                {

View Full Code Here


                        PDDocument document = parser.getPDDocument();
                        try {
                            CharArrayWriter writer = new CharArrayWriter();


                            PDFTextStripper stripper = new PDFTextStripper();
                            stripper.setLineSeparator("\n");
                            stripper.writeText(document, writer);


                            delegate = new CharArrayReader(writer.toCharArray());
                        } finally {
                            document.close();
                        }

View Full Code Here


                        PDDocument document = parser.getPDDocument();
                        try {
                            CharArrayWriter writer = new CharArrayWriter();


                            PDFTextStripper stripper = new PDFTextStripper();
                            stripper.setLineSeparator("\n");
                            stripper.writeText(document, writer);


                            delegate = new CharArrayReader(writer.toCharArray());
                        } finally {
                            document.close();
                        }

View Full Code Here

        //Just try using the default password and move on
        pdf.openProtection(new StandardDecryptionMaterial(""));
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.

View Full Code Here

   * @return Reader a reader that is fed to an indexer.
   */
  protected Reader getReader(InputStream docStream)
  {
    
    PDFParser parser = null; PDDocument document = null; PDFTextStripper stripper = null;
    CharArrayWriter writer = null;
    try{
      parser = new PDFParser(docStream);
      parser.parse();
      document = parser.getPDDocument();
      writer = new CharArrayWriter();
      stripper = new PDFTextStripper();
      stripper.setLineSeparator("\n");
      stripper.writeText(document, writer);
      document.close();
      writer.close();
      parser.getDocument().close();
      return new CharArrayReader(writer.toCharArray());
    }catch (Exception e){

View Full Code Here

        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.

View Full Code Here

         catch (IOException e)
         {
            return "";
         }


         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try

View Full Code Here

    
                PDDocument document = parser.getPDDocument();
    
                CharArrayWriter writer = new CharArrayWriter();
    
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);
    
                document.close();
                writer.close();
                
                Map result = new HashMap();

View Full Code Here

        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.

View Full Code Here

0 1 2 3 4 5

TOP

Related Classes of org.pdfbox.util.PDFTextStripper

br.com.caelum.stella.boleto.transformer.BoletoTransformerIntegrationTest

com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox.PdfBoxPDFPage

com.stimulus.archiva.extraction.PDFExtractor

com.stimulus.archiva.persistence.textextraction.PDFExtractor

de.spotnik.mail.core.message.content.PDFHandler

edu.udo.cs.wvtool.generic.inputfilter.PDFInputFilter

eu.lsem.bakalarka.filetypeprocess.document.PdfDocumentParser

eu.planets_project.services.migration.pdfbox.TextExtractor

it.unimi.dsi.mg4j.document.PdfDocumentFactory

net.fp.rp.search.back.extractor.PdfDataExtractor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.