Examples of PDFTextStripper

com.dotcms.repackage.org.apache.pdfbox.util.PDFTextStripper
org.apache.pdfbox.util.PDFTextStripper
This class will take a PDF document and strip out all of the text, ignoring the formatting and such. Please note: it is up to clients of this class to verify that a specific user has the correct permissions to extract text from the PDF document. The basic flow of this process is that we get a document and use a series of processXXX() functions that work on smaller and smaller chunks of the page. Eventually, we fully process each page and then print it. @author Ben Litchfield
org.pdfbox.util.PDFTextStripper
This class will take a pdf document and strip out all of the text and ignore the formatting and such. @author Ben Litchfield @version $Revision: 1.69 $

Examples of org.apache.pdfbox.util.PDFTextStripper

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }


                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );


                startTime = startProcessing("Starting text extraction");
                if (debug) 
                {
                    System.err.println("Writing to "+outputFile);
                }
                
                // Extract text for main document:
                stripper.writeText( document, output );
                
                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();    
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String,COSObjectable> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null) {
                            for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) 
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file.getSubtype().equals("application/pdf")) 
                                {
                                    if (debug)
                                    {
                                        System.err.println("  is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try 
                                    {
                                        subDoc = PDDocument.load(fis);
                                    } 
                                    finally 
                                    {
                                        fis.close();
                                    }
                                    try 
                                    {
                                        stripper.writeText( subDoc, output );
                                    } 
                                    finally 
                                    {
                                        subDoc.close();
                                    }

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

        {
                isris = false;
                document = PDDocument.load(file);
                PDDocumentInformation infos = document.getDocumentInformation();


                PDFTextStripper textStripper=new PDFTextStripper();


                pdfletter = new Integer(pdftextletter.getText());
                if( pdfletter < textStripper.getText(document).length())
                {
                    pdftext = textStripper.getText(document).substring(0, pdfletter);
                }
                else
                {
                    pdftext = textStripper.getText(document).substring(0, textStripper.getText(document).length());
                }
                pdfabstract.setText(pdftext);


                if( document != null )
                {

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }


                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );


                startTime = startProcessing("Starting text extraction");
                if (debug) 
                {
                    System.err.println("Writing to "+outputFile);
                }
                
                // Extract text for main document:
                stripper.writeText( document, output );
                
                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();    
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String,Object> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null) {
                            for (Map.Entry<String,Object> ent : embeddedFileNames.entrySet()) 
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file.getSubtype().equals("application/pdf")) 
                                {
                                    if (debug)
                                    {
                                        System.err.println("  is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try 
                                    {
                                        subDoc = PDDocument.load(fis);
                                    } 
                                    finally 
                                    {
                                        fis.close();
                                    }
                                    try 
                                    {
                                        stripper.writeText( subDoc, output );
                                    } 
                                    finally 
                                    {
                                        subDoc.close();
                                    }

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }


                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );


                startTime = startProcessing("Starting text extraction");
                stripper.writeText( document, output );
                stopProcessing("Time for extraction: ", startTime);
            }
            finally
            {
                if( output != null )

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

        }
    }


    public static String extractTextFrom(PDDocument document) throws IOException {
        Writer output = new StringWriter();
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(document, output);
        return output.toString().trim();
    }

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

         catch (IOException e)
         {
            throw new DocumentReadException("Can not load PDF document.", e);
         }


         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try

View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper


    PDFParser parser = new PDFParser(source);
    parser.parse();


    PDDocument document = parser.getPDDocument();
    PDFTextStripper stripper = new PDFTextStripper();


    String text = stripper.getText(document);


    document.close();


    return text;
  }

View Full Code Here

Examples of org.pdfbox.util.PDFTextStripper

    
    return super.parseProperty( key, values, metadata );
  }


  public PdfDocumentFactory() throws IOException {
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

View Full Code Here

Examples of org.pdfbox.util.PDFTextStripper

    this.wordReader = new FastBufferedReader();
  }
  
  public PdfDocumentFactory( final Properties properties ) throws IOException, ConfigurationException {
    super( properties );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

View Full Code Here

Examples of org.pdfbox.util.PDFTextStripper

    this.wordReader = new FastBufferedReader();
  }


  public PdfDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) throws IOException {
    super( defaultMetadata );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

View Full Code Here

0 1 2 3 4 5

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.