Examples of PDFTextStripper


Examples of org.apache.pdfbox.util.PDFTextStripper

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }

                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );

                startTime = startProcessing("Starting text extraction");
                if (debug)
                {
                    System.err.println("Writing to "+outputFile);
                }
               
                // Extract text for main document:
                stripper.writeText( document, output );
               
                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();   
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String,COSObjectable> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null) {
                            for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet())
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file.getSubtype().equals("application/pdf"))
                                {
                                    if (debug)
                                    {
                                        System.err.println("  is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try
                                    {
                                        subDoc = PDDocument.load(fis);
                                    }
                                    finally
                                    {
                                        fis.close();
                                    }
                                    try
                                    {
                                        stripper.writeText( subDoc, output );
                                    }
                                    finally
                                    {
                                        subDoc.close();
                                    }
View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

        {
                isris = false;
                document = PDDocument.load(file);
                PDDocumentInformation infos = document.getDocumentInformation();

                PDFTextStripper textStripper=new PDFTextStripper();

                pdfletter = new Integer(pdftextletter.getText());
                if( pdfletter < textStripper.getText(document).length())
                {
                    pdftext = textStripper.getText(document).substring(0, pdfletter);
                }
                else
                {
                    pdftext = textStripper.getText(document).substring(0, textStripper.getText(document).length());
                }
                pdfabstract.setText(pdftext);

                if( document != null )
                {
View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }

                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );

                startTime = startProcessing("Starting text extraction");
                if (debug)
                {
                    System.err.println("Writing to "+outputFile);
                }
               
                // Extract text for main document:
                stripper.writeText( document, output );
               
                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();   
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String,Object> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null) {
                            for (Map.Entry<String,Object> ent : embeddedFileNames.entrySet())
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file.getSubtype().equals("application/pdf"))
                                {
                                    if (debug)
                                    {
                                        System.err.println("  is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try
                                    {
                                        subDoc = PDDocument.load(fis);
                                    }
                                    finally
                                    {
                                        fis.close();
                                    }
                                    try
                                    {
                                        stripper.writeText( subDoc, output );
                                    }
                                    finally
                                    {
                                        subDoc.close();
                                    }
View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }

                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setForceParsing( force );
                stripper.setSortByPosition( sort );
                stripper.setShouldSeparateByBeads( separateBeads );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );

                startTime = startProcessing("Starting text extraction");
                stripper.writeText( document, output );
                stopProcessing("Time for extraction: ", startTime);
            }
            finally
            {
                if( output != null )
View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

        }
    }

    public static String extractTextFrom(PDDocument document) throws IOException {
        Writer output = new StringWriter();
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(document, output);
        return output.toString().trim();
    }
View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

         catch (IOException e)
         {
            throw new DocumentReadException("Can not load PDF document.", e);
         }

         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try
View Full Code Here

Examples of org.apache.pdfbox.util.PDFTextStripper

    PDFParser parser = new PDFParser(source);
    parser.parse();

    PDDocument document = parser.getPDDocument();
    PDFTextStripper stripper = new PDFTextStripper();

    String text = stripper.getText(document);

    document.close();

    return text;
  }
View Full Code Here

Examples of org.pdfbox.util.PDFTextStripper

   
    return super.parseProperty( key, values, metadata );
  }

  public PdfDocumentFactory() throws IOException {
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }
View Full Code Here

Examples of org.pdfbox.util.PDFTextStripper

    this.wordReader = new FastBufferedReader();
  }
 
  public PdfDocumentFactory( final Properties properties ) throws IOException, ConfigurationException {
    super( properties );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }
View Full Code Here

Examples of org.pdfbox.util.PDFTextStripper

    this.wordReader = new FastBufferedReader();
  }

  public PdfDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) throws IOException {
    super( defaultMetadata );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.