Package org.apache.pdfbox.util

Examples of org.apache.pdfbox.util.PDFTextStripper$WordSeparator


                        output = new OutputStreamWriter(
                                new FileOutputStream( outputFile ) );
                    }
                }

                PDFTextStripper stripper = null;
                if(toHTML)
                {
                    stripper = new PDFText2HTML(encoding);
                }
                else
                {
                    stripper = new PDFTextStripper(encoding);
                }
                stripper.setSortByPosition( sort );
                stripper.setStartPage( startPage );
                stripper.setEndPage( endPage );
                stripper.writeText( document, output );
            }
            finally
            {
                if( output != null )
                {
View Full Code Here


            //create a writer where to append the text content.
            StringWriter writer = new StringWriter();
            if( stripper == null )
            {
                stripper = new PDFTextStripper();
            }
            else
            {
                stripper.resetEngine();
            }
View Full Code Here

                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }

                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try
View Full Code Here

         catch (IOException e)
         {
            throw new DocumentReadException("Can not load PDF document.", e);
         }

         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try
View Full Code Here

                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }

                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try
View Full Code Here

            docTitle = MultiProtocolURI.unescape(location.getFileName());
        }
        final CharBuffer writer = new CharBuffer();
        try {
            // create a writer for output
            final PDFTextStripper  stripper = new PDFTextStripper();
            // we start the pdf parsing in a separate thread to ensure that it can be terminated
            // Worker thread whose run() writes the text of pdfDoc into writer via the stripper.
            final Thread t = new Thread() {
                public void run() {
                    // NOTE(review): the catch below discards every Throwable (including Errors)
                    // without logging, so a failed extraction is indistinguishable from an
                    // empty result here -- confirm this best-effort behavior is intended.
                    try {
                        stripper.writeText(pdfDoc, writer); // may throw a NPE
                    } catch (final Throwable e) {}
                }
            };
            t.start();
            t.join(3000);
View Full Code Here

        {
            boolean useTemporaryFile = ConfigurationManager.getBooleanProperty("pdffilter.largepdfs", false);

            // get input stream from bitstream
            // pass to filter, get string back
            PDFTextStripper pts = new PDFTextStripper();
            PDDocument pdfDoc = null;
            Writer writer = null;
            File tempTextFile = null;
            ByteArrayOutputStream byteStream = null;

            if (useTemporaryFile)
            {
                tempTextFile = File.createTempFile("dspacepdfextract" + source.hashCode(), ".txt");
                tempTextFile.deleteOnExit();
                writer = new OutputStreamWriter(new FileOutputStream(tempTextFile));
            }
            else
            {
                byteStream = new ByteArrayOutputStream();
                writer = new OutputStreamWriter(byteStream);
            }
           
            try
            {
                pdfDoc = PDDocument.load(source);
                pts.writeText(pdfDoc, writer);
            }
            finally
            {
                try
                {
View Full Code Here

      e.printStackTrace();
    }
  }

  private void temp() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();
    String text = stripper.getText(doc);
    log.info(text);
  }
View Full Code Here

  public static List<String> readLines(File file) {
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;

    try {
      PDFTextStripper pdfStripper = new PDFTextStripper();
      pdfStripper.setStartPage(1);
      pdfStripper.setEndPage(1);

      FileInputStream input = new FileInputStream(file);

      PDFParser parser = new PDFParser(input);
      parser.parse();
      cosDoc = parser.getDocument();
      pdDoc = new PDDocument(cosDoc);

      String text = pdfStripper.getText(pdDoc);
      text = text.replaceAll("\t", " ");
      Iterable<String> lines = Splitter.on("\n").split(text);
      return ImmutableList.copyOf(lines);
    } catch (IOException e) {
      return ImmutableList.of();
View Full Code Here

      PDFParser parser = new PDFParser(source);
      parser.parse();

      PDDocument document = parser.getPDDocument();
      PDFTextStripper stripper = new PDFTextStripper();

      String text = stripper.getText(document);

      document.close();

      return text;
    } catch (IOException e) {
View Full Code Here

TOP

Related Classes of org.apache.pdfbox.util.PDFTextStripper$WordSeparator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.