Package org.pdfbox.util

Examples of org.pdfbox.util.PDFTextStripper


         catch (IOException e)
         {
            throw new DocumentReadException("Can not load PDF document.", e);
         }

         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try
View Full Code Here


         catch (IOException e)
         {
            throw new DocumentReadException("Can not load PDF document.", e);
         }

         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try
View Full Code Here

    assertTrue(new File("arquivo.pdf").exists());
  }

  @Test
  public void testPDFWriterEscreveValorCorreto() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();

    PDDocument document = PDDocument.load(new File("arquivo.pdf"));
    String text = stripper.getText(document);
    document.close();
    assertTrue(text.contains("40,00"));
  }
View Full Code Here

    assertTrue(text.contains("40,00"));
  }

  @Test
  public void testPDFWriterEscreveLinhaDigitavelCorreta() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();

    PDDocument document = PDDocument.load(new File("arquivo.pdf"));
    String text = stripper.getText(document);
    document.close();

    assertTrue(text.contains("00190.00009  01207.113000  09000.206186  5  38600000004000"));
  }
View Full Code Here

        final String docPassword = "";
        if (document.isEncrypted()) {
            document.decrypt(docPassword);
        }

        final PDFTextStripper stripper = new PDFTextStripper();
        stripper.shouldSeparateByBeads();
        stripper.shouldSortByPosition();

        return stripper.getText(document);
        }
View Full Code Here

       }
       file = File.createTempFile("extract", ".tmp");
       tempFiles.markForDeletion(file);
       Writer output = null;
       output = new OutputStreamWriter(new FileOutputStream(file), DEFAULT_ENCODING);
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
       output.close();
       if(document != null)
          document.close();
     
     } catch (Exception e) {
View Full Code Here

       }
       file = File.createTempFile("extract", ".tmp");
       tempFiles.markForDeletion(file);
       Writer output = null;
       output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
       output.close();
       if(document != null)
          document.close();
     
     } catch (Exception e) {
View Full Code Here

       }
       file = File.createTempFile("extract", ".tmp");
       tempFiles.markForDeletion(file);
       Writer output = null;
       output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
       output.close();
     
     
     } catch (Exception e) {
         throw new ExtractionException("failed to extract pdf (probable password protected document)",e,logger);
View Full Code Here

    {
        boolean toConsole = false;
        int currentArgumentIndex = 0;
        String password = "";
        String encoding = DEFAULT_ENCODING;
        PDFTextStripper stripper = new PDFTextStripper();
        String pdfFile = null;
        String textFile = null;
        for( int i=0; i<args.length; i++ )
        {
            if( args[i].equals( PASSWORD ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                password = args[i];
            }
            else if( args[i].equals( ENCODING ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                encoding = args[i];
            }
            else if( args[i].equals( CONSOLE ) )
            {
                toConsole = true;
            }
            else
            {
                if( pdfFile == null )
                {
                    pdfFile = args[i];
                }
                else
                {
                    textFile = args[i];
                }
            }
        }

        if( pdfFile == null )
        {
            usage();
        }

        if( textFile == null && pdfFile.length() >4 )
        {
            textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
        }

        InputStream input = null;
        Writer output = null;
        COSDocument document = null;
        try
        {
            input = new FileInputStream( pdfFile );
            long start = System.currentTimeMillis();
            document = parseDocument( input );
            long stop = System.currentTimeMillis();
            LOG.info( "Time to parse time=" + (stop-start) );


            //document.print();
            if( document.isEncrypted() )
            {
                try
                {
                    DecryptDocument decryptor = new DecryptDocument( document );
                    decryptor.decryptDocument( password );
                }
                catch( InvalidPasswordException e )
                {
                    if( args.length == 4 )//they supplied the wrong password
                    {
                        System.err.println( "Error: The supplied password is incorrect." );
                        System.exit( 2 );
                    }
                    else
                    {
                        //they didn't supply a password and the default of "" was wrong.
                        System.err.println( "Error: The document is encrypted." );
                        usage();
                    }
                }
            }
            if( toConsole )
            {
                output = new OutputStreamWriter( System.out );
            }
            else
            {
                output = new OutputStreamWriter(
                    new FileOutputStream( textFile ), encoding );
            }

            start = System.currentTimeMillis();
            stripper.writeText( document, output );
            stop = System.currentTimeMillis();
            LOG.info( "Time to extract text time=" +(stop-start) );
        }
        finally
        {
View Full Code Here

            }

            //create a tmp output stream with the size of the content.
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            OutputStreamWriter writer = new OutputStreamWriter( out );
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText( pdfDocument.getDocument(), writer );
            writer.close();

            byte[] contents = out.toByteArray();
            InputStreamReader input = new InputStreamReader( new ByteArrayInputStream( contents ) );
            // Add the tag-stripped contents as a Reader-valued Text field so it will
View Full Code Here

TOP

Related Classes of org.pdfbox.util.PDFTextStripper

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.