Examples of org.apache.pdfbox.util.PDFTextStripper$WordWithTextPositions

org.apache.pdfbox.util.PDFTextStripper
Internal class that maps strings to lists of {@link TextPosition} arrays. Note that the number of entries in that list may differ from the number of characters in the string due to normalization. @author Axel Dörfler

        {
                isris = false;
                document = PDDocument.load(file);
                PDDocumentInformation infos = document.getDocumentInformation();


                PDFTextStripper textStripper=new PDFTextStripper();


                pdfletter = new Integer(pdftextletter.getText());
                if( pdfletter < textStripper.getText(document).length())
                {
                    pdftext = textStripper.getText(document).substring(0, pdfletter);
                }
                else
                {
                    pdftext = textStripper.getText(document).substring(0, textStripper.getText(document).length());
                }
                pdfabstract.setText(pdftext);


                if( document != null )
                {

View Full Code Here

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }


                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );


                startTime = startProcessing("Starting text extraction");
                if (debug) 
                {
                    System.err.println("Writing to "+outputFile);
                }
                
                // Extract text for main document:
                stripper.writeText( document, output );
                
                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();    
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String,Object> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null) {
                            for (Map.Entry<String,Object> ent : embeddedFileNames.entrySet()) 
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file.getSubtype().equals("application/pdf")) 
                                {
                                    if (debug)
                                    {
                                        System.err.println("  is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try 
                                    {
                                        subDoc = PDDocument.load(fis);
                                    } 
                                    finally 
                                    {
                                        fis.close();
                                    }
                                    try 
                                    {
                                        stripper.writeText( subDoc, output );
                                    } 
                                    finally 
                                    {
                                        subDoc.close();
                                    }

View Full Code Here

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }


                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );


                startTime = startProcessing("Starting text extraction");
                stripper.writeText( document, output );
                stopProcessing("Time for extraction: ", startTime);
            }
            finally
            {
                if( output != null )

View Full Code Here

        }
    }


    public static String extractTextFrom(PDDocument document) throws IOException {
        Writer output = new StringWriter();
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(document, output);
        return output.toString().trim();
    }

View Full Code Here

         catch (IOException e)
         {
            throw new DocumentReadException("Can not load PDF document.", e);
         }


         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try

View Full Code Here


    PDFParser parser = new PDFParser(source);
    parser.parse();


    PDDocument document = parser.getPDDocument();
    PDFTextStripper stripper = new PDFTextStripper();


    String text = stripper.getText(document);


    document.close();


    return text;
  }

View Full Code Here

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }


                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );


                startTime = startProcessing("Starting text extraction");
                if (debug) 
                {
                    System.err.println("Writing to "+outputFile);
                }
                
                // Extract text for main document:
                stripper.writeText( document, output );
                
                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();    
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String,COSObjectable> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null) {
                            for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) 
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file != null && file.getSubtype().equals("application/pdf")) 
                                {
                                    if (debug)
                                    {
                                        System.err.println("  is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try 
                                    {
                                        subDoc = PDDocument.load(fis);
                                    } 
                                    finally 
                                    {
                                        fis.close();
                                    }
                                    try 
                                    {
                                        stripper.writeText( subDoc, output );
                                    } 
                                    finally 
                                    {
                                        subDoc.close();
                                    }

View Full Code Here


            //create a writer where to append the text content.
            StringWriter writer = new StringWriter();
            if( stripper == null )
            {
                stripper = new PDFTextStripper();
            }
            else
            {
                stripper.resetEngine();
            }

View Full Code Here


            //create a writer where to append the text content.
            StringWriter writer = new StringWriter();
            if( stripper == null )
            {
                stripper = new PDFTextStripper();
            }
            else
            {
                stripper.resetEngine();
            }

View Full Code Here

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }


                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );
                stripper.writeText( document, output );
            }
            finally
            {
                if( output != null )
                {

View Full Code Here

0 1 2 3 4

TOP

Related Classes of org.apache.pdfbox.util.PDFTextStripper$WordWithTextPositions

br.com.objectos.way.reports.htmltopdf.Pdfs

br.net.woodstock.rockframework.document.pdf.pdfbox.GetTextProcessor

br.net.woodstock.rockframework.office.pdf.impl.PDFBoxManager

com.cardence.lawshelf.pdf.PdfBoxParserImpl

com.gentics.cr.lucene.indexer.transformer.pdf.PDFContentTransformer

com.google.code.ftspc.lector.parsers.PDF.PDFParserLocal

com.stimulus.archiva.extraction.PDFExtractor

de.pdf_scrutinizer.document.DocumentAdapter

geopms.GeoPMSImportPDF

ir.Indexer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.