Examples of org.pdfbox.util.PDFTextStripper

org.pdfbox.util.PDFTextStripper
This class takes a PDF document and strips out all of its text, ignoring the formatting. @author Ben Litchfield @version $Revision: 1.69 $

        final List fragments = new ArrayList();


        final StringWriter output = new StringWriter();
        try 
        {
            final PDFTextStripper textStripper = new PDFTextStripper()
            {
                protected void showCharacter(TextPosition textPosition) {
                    fragments.add(textPosition);
                }
            };
            textStripper.setLineSeparator(lineSeparator);
            textStripper.setStartPage(page);
            textStripper.setEndPage(page);
            textStripper.writeText(getPDFDocument(), output);
            return fragments;
        }
        catch (final Exception e) 
        {
            throw new RuntimeException("Error while extracting text from document.", e);

View Full Code Here

        this.contentHandler.startPrefixMapping(PREFIX, NAMESPACE);
        this.contentHandler.startElement(NAMESPACE, "document", PREFIX + ":document",
                new AttributesImpl());


        try {
            PDFTextStripper stripper = new PDFTextStripper();
            PDFParser parser = new PDFParser(this.content.getInputStream());
            parser.parse();
            PDDocument doc = parser.getPDDocument();
            String text = stripper.getText(doc);
            doc.close();
            char[] chars = text.toCharArray();
            this.contentHandler.characters(chars, 0, chars.length);
        } catch (Exception e) {
            throw new ProcessingException(e);

View Full Code Here

            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);


                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();

View Full Code Here

            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);


                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();

View Full Code Here


                        PDDocument document = parser.getPDDocument();
                        try {
                            CharArrayWriter writer = new CharArrayWriter();


                            PDFTextStripper stripper = new PDFTextStripper();
                            stripper.setLineSeparator("\n");
                            stripper.writeText(document, writer);


                            delegate = new CharArrayReader(writer.toCharArray());
                        } finally {
                            document.close();
                        }

View Full Code Here

        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.

View Full Code Here

       }
       file = File.createTempFile("extract", ".tmp");
       tempFiles.markForDeletion(file);
       Writer output = null;
       output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
       output.close();
      /*logger.debug("PDF extraction completed");
       BufferedReader reader;
       try {
         reader = new BufferedReader(new FileReader(file));

View Full Code Here

        {
            boolean useTemporaryFile = ConfigurationManager.getBooleanProperty("pdffilter.largepdfs", false);


            // get input stream from bitstream
            // pass to filter, get string back
            PDFTextStripper pts = new PDFTextStripper();
            PDDocument pdfDoc = null;
            Writer writer = null;
            File tempTextFile = null;
            ByteArrayOutputStream byteStream = null;


            if (useTemporaryFile)
            {
                tempTextFile = File.createTempFile("dspacepdfextract" + source.hashCode(), ".txt");
                tempTextFile.deleteOnExit();
                writer = new OutputStreamWriter(new FileOutputStream(tempTextFile));
            }
            else
            {
                byteStream = new ByteArrayOutputStream();
                writer = new OutputStreamWriter(byteStream);
            }
            
            try
            {
                pdfDoc = PDDocument.load(source);
                pts.writeText(pdfDoc, writer);
            }
            finally
            {
                try
                {

View Full Code Here

                pdfDocument.decrypt( "" );
            }
            
            //create a writer to which the text content will be appended.
            StringWriter writer = new StringWriter();
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText( pdfDocument, writer );


            // Note: the buffer to string operation is costless;
            // the char array value of the writer buffer and the content string
            // is shared as long as the buffer content is not modified, which will
            // not occur here.

View Full Code Here

        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.

View Full Code Here

0 1 2 3 4 5

TOP

Related Classes of org.pdfbox.util.PDFTextStripper

br.com.caelum.stella.boleto.transformer.BoletoTransformerIntegrationTest

com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox.PdfBoxPDFPage

com.stimulus.archiva.extraction.PDFExtractor

com.stimulus.archiva.persistence.textextraction.PDFExtractor

de.spotnik.mail.core.message.content.PDFHandler

edu.udo.cs.wvtool.generic.inputfilter.PDFInputFilter

eu.lsem.bakalarka.filetypeprocess.document.PdfDocumentParser

eu.planets_project.services.migration.pdfbox.TextExtractor

it.unimi.dsi.mg4j.document.PdfDocumentFactory

net.fp.rp.search.back.extractor.PdfDataExtractor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.