Examples of org.pdfbox.util.PDFTextStripper

org.pdfbox.util.PDFTextStripper
This class takes a PDF document and strips out all of the text, ignoring formatting and similar layout information. @author Ben Litchfield @version $Revision: 1.69 $

         catch (IOException e)
         {
            throw new DocumentReadException("Can not load PDF document.", e);
         }


         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try

View Full Code Here

         catch (IOException e)
         {
            throw new DocumentReadException("Can not load PDF document.", e);
         }


         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try

View Full Code Here

    assertTrue(new File("arquivo.pdf").exists());
  }


  @Test
  public void testPDFWriterEscreveValorCorreto() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();


    PDDocument document = PDDocument.load(new File("arquivo.pdf"));
    String text = stripper.getText(document);
    document.close();
    assertTrue(text.contains("40,00"));
  }

View Full Code Here

    assertTrue(text.contains("40,00"));
  }


  @Test
  public void testPDFWriterEscreveLinhaDigitavelCorreta() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();


    PDDocument document = PDDocument.load(new File("arquivo.pdf"));
    String text = stripper.getText(document);
    document.close();


    assertTrue(text.contains("00190.00009  01207.113000  09000.206186  5  38600000004000"));
  }

View Full Code Here

        final String docPassword = "";
        if (document.isEncrypted()) {
            document.decrypt(docPassword);
        }


        final PDFTextStripper stripper = new PDFTextStripper();
        stripper.shouldSeparateByBeads();
        stripper.shouldSortByPosition();


        return stripper.getText(document);
        }

View Full Code Here

       }
       file = File.createTempFile("extract", ".tmp");
       tempFiles.markForDeletion(file);
       Writer output = null;
       output = new OutputStreamWriter(new FileOutputStream(file), DEFAULT_ENCODING);
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
       output.close();
       if(document != null)
          document.close();
      
     } catch (Exception e) {

View Full Code Here

       }
       file = File.createTempFile("extract", ".tmp");
       tempFiles.markForDeletion(file);
       Writer output = null;
       output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
       output.close();
       if(document != null)
          document.close();
      
     } catch (Exception e) {

View Full Code Here

       }
       file = File.createTempFile("extract", ".tmp");
       tempFiles.markForDeletion(file);
       Writer output = null;
       output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
       output.close();
      
      
     } catch (Exception e) {
         throw new ExtractionException("failed to extract pdf (probable password protected document)",e,logger);

View Full Code Here

    {
        boolean toConsole = false;
        int currentArgumentIndex = 0;
        String password = "";
        String encoding = DEFAULT_ENCODING;
        PDFTextStripper stripper = new PDFTextStripper();
        String pdfFile = null;
        String textFile = null;
        for( int i=0; i<args.length; i++ )
        {
            if( args[i].equals( PASSWORD ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                password = args[i];
            }
            else if( args[i].equals( ENCODING ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                encoding = args[i];
            }
            else if( args[i].equals( CONSOLE ) )
            {
                toConsole = true;
            }
            else
            {
                if( pdfFile == null )
                {
                    pdfFile = args[i];
                }
                else
                {
                    textFile = args[i];
                }
            }
        }


        if( pdfFile == null )
        {
            usage();
        }


        if( textFile == null && pdfFile.length() >4 )
        {
            textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
        }


        InputStream input = null;
        Writer output = null;
        COSDocument document = null;
        try
        {
            input = new FileInputStream( pdfFile );
            long start = System.currentTimeMillis();
            document = parseDocument( input );
            long stop = System.currentTimeMillis();
            LOG.info( "Time to parse time=" + (stop-start) );




            //document.print();
            if( document.isEncrypted() )
            {
                try
                {
                    DecryptDocument decryptor = new DecryptDocument( document );
                    decryptor.decryptDocument( password );
                }
                catch( InvalidPasswordException e )
                {
                    if( args.length == 4 )//they supplied the wrong password
                    {
                        System.err.println( "Error: The supplied password is incorrect." );
                        System.exit( 2 );
                    }
                    else
                    {
                        //they didn't suppply a password and the default of "" was wrong.
                        System.err.println( "Error: The document is encrypted." );
                        usage();
                    }
                }
            }
            if( toConsole )
            {
                output = new OutputStreamWriter( System.out );
            }
            else
            {
                output = new OutputStreamWriter(
                    new FileOutputStream( textFile ), encoding );
            }


            start = System.currentTimeMillis();
            stripper.writeText( document, output );
            stop = System.currentTimeMillis();
            LOG.info( "Time to extract text time=" +(stop-start) );
        }
        finally
        {

View Full Code Here

            }


            //create a tmp output stream with the size of the content.
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            OutputStreamWriter writer = new OutputStreamWriter( out );
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText( pdfDocument.getDocument(), writer );
            writer.close();


            byte[] contents = out.toByteArray();
            InputStreamReader input = new InputStreamReader( new ByteArrayInputStream( contents ) );
            // Add the tag-stripped contents as a Reader-valued Text field so it will

View Full Code Here

0 1 2 3 4 5

TOP

Related Classes of org.pdfbox.util.PDFTextStripper

br.com.caelum.stella.boleto.transformer.BoletoTransformerIntegrationTest

com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox.PdfBoxPDFPage

com.stimulus.archiva.extraction.PDFExtractor

com.stimulus.archiva.persistence.textextraction.PDFExtractor

de.spotnik.mail.core.message.content.PDFHandler

edu.udo.cs.wvtool.generic.inputfilter.PDFInputFilter

eu.lsem.bakalarka.filetypeprocess.document.PdfDocumentParser

eu.planets_project.services.migration.pdfbox.TextExtractor

it.unimi.dsi.mg4j.document.PdfDocumentFactory

net.fp.rp.search.back.extractor.PdfDataExtractor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.