Package org.pdfclown.samples.cli

Source Code of org.pdfclown.samples.cli.BasicTextExtractionSample

package org.pdfclown.samples.cli;

import org.pdfclown.documents.Document;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ContentScanner;
import org.pdfclown.documents.contents.fonts.Font;
import org.pdfclown.documents.contents.objects.ContainerObject;
import org.pdfclown.documents.contents.objects.ContentObject;
import org.pdfclown.documents.contents.objects.ShowText;
import org.pdfclown.documents.contents.objects.Text;
import org.pdfclown.files.File;

/**
  This sample demonstrates the <b>low-level way to extract text</b> from a PDF document.
  <h3>Remarks</h3>
  <p>In order to obtain richer information about the extracted text content,
  see the other available samples ({@link TextInfoExtractionSample}, {@link AdvancedTextExtractionSample}).</p>

  @author Stefano Chizzolini (http://www.stefanochizzolini.it)
  @since 0.0.8
  @version 0.1.0
*/
public class BasicTextExtractionSample
  extends Sample
{
  @Override
  public boolean run(
    )
  {
    String filePath = promptPdfFileChoice("Please select a PDF file");

    // 1. Open the PDF file!
    File file;
    try
    {file = new File(filePath);}
    catch(Exception e)
    {throw new RuntimeException(filePath + " file access error.",e);}

    Document document = file.getDocument();

    // 2. Text extraction from the document pages.
    for(Page page : document.getPages())
    {
      if(!promptNextPage(page, false))
        return false;

      extract(
        new ContentScanner(page) // Wraps the page contents into a scanner.
        );
    }

    return true;
  }

  /**
    Scans a content level looking for text.
  */
  /*
    NOTE: Page contents are represented by a sequence of content objects,
    possibly nested into multiple levels.
  */
  private void extract(
    ContentScanner level
    )
  {
    if(level == null)
      return;

    while(level.moveNext())
    {
      ContentObject content = level.getCurrent();
      if(content instanceof ShowText)
      {
        Font font = level.getState().getFont();
        // Extract the current text chunk, decoding it!
        System.out.println(font.decode(((ShowText)content).getText()));
      }
      else if(content instanceof Text
        || content instanceof ContainerObject)
      {
        // Scan the inner level!
        extract(level.getChildLevel());
      }
    }
  }
}
TOP

Related Classes of org.pdfclown.samples.cli.BasicTextExtractionSample

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.