Package org.jpedal.examples.simpleviewer.utils

Source Code of org.jpedal.examples.simpleviewer.utils.Exporter

/**
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info:  http://www.jpedal.org
* (C) Copyright 1997-2008, IDRsolutions and Contributors.
*
*   This file is part of JPedal
*
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


*
* ---------------
* Exporter.java
* ---------------
*/
package org.jpedal.examples.simpleviewer.utils;

import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.imageio.ImageIO;

import javax.swing.ProgressMonitor;

import org.jpedal.PdfDecoder;

import org.jpedal.examples.simpleviewer.gui.SwingGUI;
import org.jpedal.examples.simpleviewer.gui.popups.SaveBitmap;

import org.jpedal.examples.simpleviewer.gui.popups.SaveImage;
import org.jpedal.examples.simpleviewer.gui.popups.SaveText;
import org.jpedal.objects.PdfImageData;

import org.jpedal.exception.PdfException;
import org.jpedal.exception.PdfSecurityException;

import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.gui.GUIFactory;

import org.jpedal.io.ColorSpaceConvertor;
import org.jpedal.io.JAIHelper;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Messages;
import org.jpedal.utils.Strip;
import org.jpedal.utils.SwingWorker;

/**provide save functions for SimpleViewer to write out text, images, etc*/
public class Exporter {
 
  final public static int RECTANGLE=1;
  final public static int WORDLIST=2;
  final public static int TABLE=3;
 
  /**file separator used*/
  private final String separator=System.getProperty( "file.separator" );
 
  private String fileName="";
 
  private GUIFactory currentGUI;
 
  private PdfDecoder dPDF;
 
  private String selectedFile;
 
  public Exporter(SwingGUI currentGUI,String selectedFile,PdfDecoder decode_pdf){
    String fileName = new File(selectedFile).getName();
    if(fileName.lastIndexOf('.') != -1)
      fileName = fileName.substring(0,fileName.lastIndexOf('.'));
   
    StringBuilder fileNameBuffer = new StringBuilder(fileName);
    int index;
    while((index = fileNameBuffer.toString().indexOf("%20")) != -1){
      fileNameBuffer.replace(index,index+3," ");
    }
   
    this.fileName = fileNameBuffer.toString();
    this.currentGUI=currentGUI;
    this.selectedFile=selectedFile;
    this.dPDF=decode_pdf;
   
  }
 
 
 
  /**save image - different versions have different bugs for file formats so we use best for
   * each image type
   * @param image_to_save
   */
  private static void saveImage(BufferedImage image_to_save, String fileName,String prefix) {

        if(JAIHelper.isJAIused())
        JAIHelper.confirmJAIOnClasspath();

        if(prefix.contains("tif") && JAIHelper.isJAIused()){

      try {
        FileOutputStream os = new FileOutputStream(fileName);
        javax.media.jai.JAI.create("encode", image_to_save, os, "TIFF", null);
      } catch (FileNotFoundException e) {
        e.printStackTrace();
      }
     
    }else{ //default
      try {
       
        ImageIO.write(image_to_save,prefix,new File(fileName));
       
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * routine to write out clipped PDFs
   */
  private void decodeHires(int start,int end,String imageType,String output_dir){
   
    PdfDecoder decode_pdf=null;
   
    String target="";
   
    //PdfDecoder returns a PdfException if there is a problem
    try{
     
      decode_pdf = new PdfDecoder( false );
      decode_pdf.setExtractionMode(PdfDecoder.FINALIMAGES+PdfDecoder.CLIPPEDIMAGES,72,1);
     
      /** open the file (and read metadata including pages in  file)*/
      decode_pdf.openPdfFile( selectedFile );
     
    }catch( Exception e ){
      e.printStackTrace();
    }
   
    /**
     * extract data from pdf (if allowed).
     */
    if ((decode_pdf.isEncrypted()&&(!decode_pdf.isPasswordSupplied()))&&(!decode_pdf.isExtractionAllowed()))
      return;
   
    ProgressMonitor status = new ProgressMonitor(currentGUI.getFrame(),
        Messages.getMessage("PdfViewerMessage.ExtractImages"),"",start,end);
   
    try{
      int count=0;
      boolean yesToAll = false;
      for( int page = start;page < end + 1;page++ ){ //read pages
        if(status.isCanceled()){
          currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
              count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfImagesExported"));
          return;
        }
        //decode the page
        decode_pdf.decodePage( page );
       
        //get the PdfImages object which now holds the images.
        //binary data is stored in a temp directory and we hold the
        //image name and other info in this object
        PdfImageData pdf_images = decode_pdf.getPdfImageData();
       
        //image count (note image 1 is item 0, so any loop runs 0 to count-1)
        int image_count = pdf_images.getImageCount();
       
        if(image_count>0){
          target=output_dir+page+separator;
          File targetExists=new File(target);
          if(!targetExists.exists())
            targetExists.mkdir();
        }
       
        //work through and save each image
        for( int i = 0;i < image_count;i++ ){
         
          String image_name =pdf_images.getImageName( i );
          BufferedImage image_to_save;
         
          float x1=pdf_images.getImageXCoord(i);
          float y1=pdf_images.getImageYCoord(i);
          float w=pdf_images.getImageWidth(i);
          float h=pdf_images.getImageHeight(i);
         
          try{
           
            image_to_save =decode_pdf.getObjectStore().loadStoredImage"CLIP_"+image_name );
           
            //save image

            if(image_to_save!=null){
             
              //remove transparency on jpeg
              if(imageType.toLowerCase().startsWith("jp"))
                image_to_save=ColorSpaceConvertor.convertToRGB(image_to_save);

              File fileToSave = new File(target+image_name+ '.' +imageType);
              if(fileToSave.exists() && !yesToAll){
                int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),true);
                       
                        if(n==0){
                          // clicked yes so just carry on for this once
                        }else if(n==1){
                          // clicked yes to all, so set flag
                          yesToAll = true;
                        }else if(n==2){
                          // clicked no, so loop round again
                          status.setProgress(page);
                          continue;
                        }else{
                         
                          currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
                              count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfImagesExported"));
                         
                          status.close();
                          return;
                        }
              }
             
              saveImage(image_to_save,target+image_name+ '.' +imageType,imageType);
              count++;
            }
           
            //save an xml file with details
            /**
             * output the data
             */
            //LogWriter.writeLog( "Writing out "+(outputName + ".xml"));
            OutputStreamWriter output_stream =
              new OutputStreamWriter(
                  new FileOutputStream(target+image_name + ".xml"),
              "UTF-8");
           
            output_stream.write(
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
            output_stream.write(
            "<!-- Pixel Location of image x1,y1,x2,y2\n");
            output_stream.write("(x1,y1 is top left corner)\n");
            output_stream.write(
            "(origin is bottom left corner)  -->\n");
            output_stream.write("\n\n<META>\n");
            output_stream.write(
                "<PAGELOCATION x1=\""+ x1+ "\" "
                + "y1=\""+ (y1+h)+ "\" "
                + "x2=\""+ (x1+w)+ "\" "
                + "y2=\""+ (y1)+ "\" />\n");
            output_stream.write("<FILE>"+this.fileName+"</FILE>\n");
            output_stream.write("</META>\n");
            output_stream.close();
          }catch( Exception ee ){
            ee.printStackTrace();
            LogWriter.writeLog( "Exception " + ee + " in extracting images" );
          }
        }
       
       
        //flush images in case we do more than 1 page so only contains
        //images from current page
        decode_pdf.flushObjectValues(true);
       
        status.setProgress(page+1);
       
      }
      status.close();
     
      currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.ImagesSavedTo")+ ' ' +output_dir);
     
     
    }catch( Exception e ){
      decode_pdf.closePdfFile();
      LogWriter.writeLog( "Exception " + e.getMessage() );
    }
   
   
   
    /**close the pdf file*/
    decode_pdf.closePdfFile();
   
  }
 
 
  public void extractImagesOnPages(SaveImage current_selection) {
    final int startPage = current_selection.getStartPage();
    final int endPage = current_selection.getEndPage();
   
    if(startPage < 1 || endPage < 1)
      return;
   
    final int type=current_selection.getImageType();
    //get user choice
    final String format = current_selection.getPrefix();
    final String output_dir = current_selection.getRootDir()+separator+fileName+separator+"images"+separator;
   
    File testDirExists=new File(output_dir);
    if(!testDirExists.exists())
      testDirExists.mkdirs();
   
    final SwingWorker worker = new SwingWorker() {
      public Object construct() {
        //do the save
       
        switch(type){
        case PdfDecoder.CLIPPEDIMAGES:
          decodeHires(startPage,endPage,format,output_dir);
          break;
        case PdfDecoder.RAWIMAGES:
          decodeImages(startPage,endPage,format,output_dir,false);
          break;
        case PdfDecoder.FINALIMAGES:
          decodeImages(startPage,endPage,format,output_dir,true);
          break;
        default:
          System.out.println("Unknown setting");
        break;
        }
       
        return null;
      }
    };
   
    worker.start();   
  }
 
 
  /**
   * routine to write out images in PDFs
   */
  private void decodeImages(int start,int end,String prefix,String output_dir,boolean downsampled){
   
    PdfDecoder decode_pdf=null;
   
    //PdfDecoder returns a PdfException if there is a problem
    try{
     
      decode_pdf = new PdfDecoder( false );
     
      decode_pdf.setExtractionMode(PdfDecoder.RAWIMAGES+PdfDecoder.FINALIMAGES,72,1);
      /** open the file (and read metadata including pages in  file)*/
      decode_pdf.openPdfFile( selectedFile );
     
    }catch( Exception e ){
      e.printStackTrace();
    }
   
    /**
     * extract data from pdf (if allowed).
     */
    if ((decode_pdf.isEncrypted()&&(!decode_pdf.isPasswordSupplied()))&&(!decode_pdf.isExtractionAllowed()))
      return;
   
    ProgressMonitor status = new ProgressMonitor(currentGUI.getFrame(),
        Messages.getMessage("PdfViewerMessage.ExtractImages"),"",start,end);
   
    try{
      int count=0;
      boolean yesToAll = false;
      for( int page = start;page < end + 1;page++ ){ //read pages
        if(status.isCanceled()){
          currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
                            count + Messages.getMessage("PdfViewerError.ReportNumberOfImagesExported"));
          return;
        }
        //decode the page
        decode_pdf.decodePage( page );
       
        //get the PdfImages object which now holds the images.
        //binary data is stored in a temp directory and we hold the
        //image name and other info in this object
        PdfImageData pdf_images = decode_pdf.getPdfImageData();
       
        //image count (note image 1 is item 0, so any loop runs 0 to count-1)
        int image_count = pdf_images.getImageCount();
       
        String target=output_dir+separator;
        if(downsampled)
          target=target+"downsampled"+separator+page+separator;
        else
          target=target+"normal"+separator+page+separator;
       
        //tell user
        if( image_count > 0 ){
         
         
          //create a directory for page
          File page_path = new File( target );
          if( page_path.exists() == false )
            page_path.mkdirs();
         
         
          //do it again as some OS struggle with creating nested dirs
          page_path = new File( target );
          if( page_path.exists() == false )
            page_path.mkdirs();
         
        }
       
        //work through and save each image
        for( int i = 0;i < image_count;i++ )
        {
          String image_name = pdf_images.getImageName( i );
          BufferedImage image_to_save;
         
          try
          {
            if(downsampled){
              //load processed version of image (converted to rgb)
              image_to_save = decode_pdf.getObjectStore().loadStoredImage( image_name );
              if(prefix.toLowerCase().startsWith("jp")){
                image_to_save=ColorSpaceConvertor.convertToRGB(image_to_save);
               
              }
            }else{
              //get raw version of image (R prefix for raw image)
              image_to_save = decode_pdf.getObjectStore().loadStoredImage( image_name );
              if(prefix.toLowerCase().startsWith("jp")){
                image_to_save=ColorSpaceConvertor.convertToRGB(image_to_save);
              }     
            }
           
            File fileToSave = new File(target+ image_name+ '.' +prefix);
            if(fileToSave.exists() && !yesToAll){
              int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),true);
                     
                      if(n==0){
                        // clicked yes so just carry on for this once
                      }else if(n==1){
                        // clicked yes to all, so set flag
                        yesToAll = true;
                      }else if(n==2){
                        // clicked no, so loop round again
                        status.setProgress(page);
                        continue;
                      }else{
                       
                        currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
                            count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfImagesExported"));
                       
                        status.close();
                        return;
                      }
            }
           
            //save image
            saveImage(image_to_save,target+ image_name+ '.' +prefix,prefix);
            count++;
          }
         
         
          catch( Exception ee )
          {
            System.err.println( "Exception " + ee + " in extracting images" );
          }
        }
       
        //flush images in case we do more than 1 page so only contains
        //images from current page
        decode_pdf.flushObjectValues(true);
       
       
        status.setProgress(page+1);
      }

      currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.ImagesSavedTo")+ ' ' +output_dir);
     
      status.close();
    }catch( Exception e ){
      decode_pdf.closePdfFile();
      LogWriter.writeLog( "Exception " + e.getMessage() );
    }
   
    /**close the pdf file*/
    decode_pdf.closePdfFile();
   
  }
 
 
  public void extractTextOnPages(SaveText current_selection) {
    //get user choice
    final int startPage = current_selection.getStartPage();
    final int endPage = current_selection.getEndPage();
   
    if(startPage < 1 || endPage < 1)
      return;
   
    final int type=current_selection.getTextType();
    final boolean useXMLExtraction=current_selection.isXMLExtaction();
   
    final String output_dir = current_selection.getRootDir()+separator+fileName+separator+"text"+separator;
   
    File testDirExists=new File(output_dir);
    if(!testDirExists.exists())
      testDirExists.mkdirs();
   
    final SwingWorker worker = new SwingWorker() {
      public Object construct() {
        //do the save
       
        switch(type){
        case Exporter.RECTANGLE:
          decodeTextRectangle(startPage,endPage,output_dir,useXMLExtraction);
          break;
        case Exporter.WORDLIST:
          decodeTextWordlist(startPage,endPage,output_dir,useXMLExtraction);
          break;
        case Exporter.TABLE:
          decodeTextTable(startPage,endPage,output_dir,useXMLExtraction);
         
          break;
        default:
          System.out.println("Unknown setting");
        break;
        }
       
        return null;
      }
    };
   
    worker.start()
   
  }
 
 
 
 
  private void decodeTextTable(int startPage, int endPage, String output_dir, boolean useXMLExtraction) {
   
    PdfDecoder decode_pdf=null;
   
    try {
      decode_pdf = new PdfDecoder(false);
      decode_pdf.setExtractionMode(PdfDecoder.TEXT); //extract just text
     
      decode_pdf.init(true);
     
      /**
       * open the file (and read metadata including pages in  file)
       */
     
      decode_pdf.openPdfFile(selectedFile);
             
    } catch (Exception e) {
      System.err.println("Exception " + e + " in pdf code");
    }
   
    /**
     * extract data from pdf (if allowed).
     */
    if ((decode_pdf.isEncrypted()&&(!decode_pdf.isPasswordSupplied()))&& (!decode_pdf.isExtractionAllowed())) {
      System.out.println("Encrypted settings");
      System.out.println("Please look at SimpleViewer for code sample to handle such files");
    } else {
     
      ProgressMonitor status = new ProgressMonitor(currentGUI.getFrame(),
          Messages.getMessage("PdfViewerMessage.ExtractText"),"",startPage,endPage);
      /**
       * extract data from pdf
       */
      try {
        int count=0;
        boolean yesToAll = false;
        for (int page = startPage; page < endPage + 1; page++) { //read pages
          if(status.isCanceled()){
            currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +count
                + ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
            return;
          }
          //decode the page
          decode_pdf.decodePage(page);
         
          /** create a grouping object to apply grouping to data*/
          PdfGroupingAlgorithms currentGrouping =decode_pdf.getGroupingObject();
         
          /**use whole page size for  demo - get data from PageData object*/
          PdfPageData currentPageData = decode_pdf.getPdfPageData();
         
          int x1,y1,x2,y2;
         
          x1 = currentPageData.getMediaBoxX(page);
          x2 = currentPageData.getMediaBoxWidth(page)+x1;
         
          y2 = currentPageData.getMediaBoxY(page);
          y1 = currentPageData.getMediaBoxHeight(page)+y2;
         
          //default for xml
          String ending="_text.csv";
         
          if(useXMLExtraction)
            ending="_xml.txt";
         
          /**Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */
         
          /**The call to extract the table*/
          Map tableContent =null;
          String tableText=null;
         
          try{
            //the source code for this grouping is in the customer area
            //in class pdfGroupingAlgorithms
            //all these settings are defined in the Java
           
            tableContent =currentGrouping.extractTextAsTable(
                x1,
                y1,
                x2,
                y2,
                page,
                !useXMLExtraction,
                false,
                false,false,0);
           
            //get the text from the Map object
            tableText=(String)tableContent.get("content");
           
          } catch (PdfException e) {
            decode_pdf.closePdfFile();
            System.err.println("Exception " + e.getMessage()+" with table extraction");
          }catch (Error e) {
            e.printStackTrace();
          }
         
          if (tableText == null) {
            System.out.println("No text found");
          } else {
           
           
            String target=output_dir+separator+"table"+separator;
           
            //create a directory if it doesn't exist
            File output_path = new File(target);
            if (output_path.exists() == false)
              output_path.mkdirs();
           
            File fileToSave = new File(target + fileName+ '_' +page+ ending);
            if(fileToSave.exists() && !yesToAll){
              if((endPage - startPage) > 1){
                        int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),true);
                       
                        if(n==0){
                          // clicked yes so just carry on for this once
                        }else if(n==1){
                          // clicked yes to all, so set flag
                          yesToAll = true;
                        }else if(n==2){
                          // clicked no, so loop round again
                          status.setProgress(page);
                          continue;
                        }else{
                         
                          currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
                              count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
                         
                          status.close();
                          return;
                        }
                      }else{
                        int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),false);
                       
                        if(n==0){
                          // clicked yes so just carry on
                        }else{
                          // clicked no, so exit
                          return;
                        }
                      }
            }
           
            /**
             * output the data - you may wish to alter the encoding to suit
             */
            OutputStreamWriter output_stream =
              new OutputStreamWriter(
                  new FileOutputStream(target + fileName+ '_' +page+ ending),
              "UTF-8");
           
//            xml header
            if(useXMLExtraction)
              output_stream.write("<xml><BODY>\n\n");
           
            output_stream.write(tableText); //write actual data
           
//            xml footer
            if(useXMLExtraction)
              output_stream.write("\n</body></xml>");
           
            output_stream.close();
           
          }
          count++;
          status.setProgress(page+1);
          //remove data once written out
          decode_pdf.flushObjectValues(false);
        }
        status.close();
        currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.TextSavedTo")+ ' ' +output_dir);
      } catch (Exception e) {
        decode_pdf.closePdfFile();
        System.err.println("Exception " + e.getMessage());
        e.printStackTrace();
      }catch(Error e){
        System.out.println("h34343");
        e.printStackTrace();
      }
     
      decode_pdf.flushObjectValues(true); //flush any text data read
     
    }
   
    /**close the pdf file*/
    decode_pdf.closePdfFile();
   
  }
 
  private void decodeTextWordlist(int startPage, int endPage, String output_dir,boolean useXMLExtraction) {
   
    PdfDecoder decode_pdf=null;
   
    //PdfDecoder returns a PdfException if there is a problem
    try {
      decode_pdf = new PdfDecoder(false);
     
      decode_pdf.setExtractionMode(PdfDecoder.TEXT); //extract just text
      decode_pdf.init(true);
     
     
      //always reset to use unaltered co-ords - allow use of rotated or unrotated
      // co-ordinates on pages with rotation (used to be in PdfDecoder)
      PdfGroupingAlgorithms.useUnrotatedCoords=false;
     
      /**
       * open the file (and read metadata including pages in  file)
       */
      decode_pdf.openPdfFile(selectedFile);
     
    } catch (PdfSecurityException e) {
      System.err.println("Exception " + e+" in pdf code for wordlist"+selectedFile);
    } catch (PdfException e) {
      System.err.println("Exception " + e+" in pdf code for wordlist"+selectedFile);
     
    } catch (Exception e) {
      System.err.println("Exception " + e+" in pdf code for wordlist"+selectedFile);
      e.printStackTrace();
    }
   
    /**
     * extract data from pdf (if allowed).
     */
    if ((decode_pdf.isEncrypted()&&(!decode_pdf.isPasswordSupplied()))&& (!decode_pdf.isExtractionAllowed())) {
      System.out.println("Encrypted settings");
      System.out.println("Please look at SimpleViewer for code sample to handle such files");
     
    } else{
      //page range
      int start = startPage, end = endPage;
      int wordsExtracted=0;
     
      ProgressMonitor status = new ProgressMonitor(currentGUI.getFrame(),
          Messages.getMessage("PdfViewerMessage.ExtractText"),"",startPage,endPage);
     
      /**
       * extract data from pdf
       */
      try {
        int count=0;
        boolean yesToAll = false;
        for (int page = start; page < end + 1; page++) { //read pages
          if(status.isCanceled()){
            currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
                count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
            return;
          }
          //decode the page
          decode_pdf.decodePage(page);
         
          /** create a grouping object to apply grouping to data*/
          PdfGroupingAlgorithms currentGrouping =decode_pdf.getGroupingObject();
         
          /**use whole page size for  demo - get data from PageData object*/
          PdfPageData currentPageData = decode_pdf.getPdfPageData();
         
          int x1 = currentPageData.getMediaBoxX(page);
          int x2 = currentPageData.getMediaBoxWidth(page)+x1;
         
          int y2 = currentPageData.getMediaBoxX(page);
          int y1 = currentPageData.getMediaBoxHeight(page)-y2;
         
          /**Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */
         
          /**The call to extract the list*/
          List words =null;
         
          try{
            words =currentGrouping.extractTextAsWordlist(
                x1,
                y1,
                x2,
                y2,
                page,
                true,"&:=()!;.,\\/\"\"\'\'");
          } catch (PdfException e) {
            decode_pdf.closePdfFile();
            System.err.println("Exception= "+ e+" in "+selectedFile);
            e.printStackTrace();
          }catch(Error e){
            e.printStackTrace();
          }
         
          if (words == null) {
           
            System.out.println("No text found");
           
          } else {
           
            String target=output_dir+separator+"wordlist"+separator;
           
            //create a directory if it doesn't exist
            File output_path = new File(target);
            if (output_path.exists() == false)
              output_path.mkdirs();
           
            /**
             * choose correct prefix
             */
            String prefix="_text.txt";
            String encoding=System.getProperty("file.encoding");
           
            if(useXMLExtraction){
              prefix="_xml.txt";
              encoding="UTF-8";
            }
           
            /**each word is stored as 5 consecutive values (word,x1,y1,x2,y2)*/
            int wordCount=words.size()/5;
           
            //update our count
            wordsExtracted=wordsExtracted+wordCount;
           
           
            File fileToSave = new File(target + fileName+ '_' +page + prefix);
            if(fileToSave.exists() && !yesToAll){
              if((endPage - startPage) > 1){
                        int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),true);
                       
                        if(n==0){
                          // clicked yes so just carry on for this once
                        }else if(n==1){
                          // clicked yes to all, so set flag
                          yesToAll = true;
                        }else if(n==2){
                          // clicked no, so loop round again
                          status.setProgress(page);
                          continue;
                        }else{
                         
                          currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
                              count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
                         
                          status.close();
                          return;
                        }
                      }else{
                        int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),false);
                       
                        if(n==0){
                          // clicked yes so just carry on
                        }else{
                          // clicked no, so exit
                          return;
                        }
                      }
            }
           
           
            /**
             * output the data
             */
            OutputStreamWriter output_stream =
              new OutputStreamWriter(
                  new FileOutputStream(target + fileName+ '_' +page + prefix),
                  encoding);
           
            Iterator wordIterator=words.iterator();
            while(wordIterator.hasNext()){
             
              String currentWord=(String) wordIterator.next();
             
              /**remove the XML formatting if present - not needed for pure text*/
              if(!useXMLExtraction)
                currentWord=Strip.convertToText(currentWord,true);
             
              int wx1=(int)Float.parseFloat((String) wordIterator.next());
              int wy1=(int)Float.parseFloat((String) wordIterator.next());
              int wx2=(int)Float.parseFloat((String) wordIterator.next());
              int wy2=(int)Float.parseFloat((String) wordIterator.next());
             
              /**this could be inserting into a database instead*/
              output_stream.write(currentWord+ ',' +wx1+ ',' +wy1+ ',' +wx2+ ',' +wy2+ '\n');
             
            }
            output_stream.close();
           
          }
         
          count++;
          status.setProgress(page+1);
         
          //remove data once written out
          decode_pdf.flushObjectValues(false);
         
        }
        status.close();
        currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.TextSavedTo")+ ' ' +output_dir);
      } catch (Exception e) {
        decode_pdf.closePdfFile();
        System.err.println("Exception "+ e+" in "+selectedFile);
        e.printStackTrace();
      }catch(Error e){
        e.printStackTrace();
     
    }
   
    /**close the pdf file*/
    decode_pdf.closePdfFile();
   
    decode_pdf=null;
   
   
  }
 
  private void decodeTextRectangle(int startPage, int endPage, String output_dir,boolean useXMLExtraction) {
   
    PdfDecoder decode_pdf=null;
   
    //PdfDecoder returns a PdfException if there is a problem
    try {
      decode_pdf = new PdfDecoder( false );
     
      if(!useXMLExtraction)
                decode_pdf.useTextExtraction();
     
      decode_pdf.setExtractionMode(PdfDecoder.TEXT); //extract just text
      decode_pdf.init(true);
     
      /**
       * open the file (and read metadata including pages in  file)
       */
      decode_pdf.openPdfFile(selectedFile);
     
    } catch (PdfSecurityException se) {
      System.err.println("Security Exception " + se + " in pdf code for text extraction on file ");
      //e.printStackTrace();
    } catch (PdfException se) {
      System.err.println("Pdf Exception " + se + " in pdf code for text extraction on file ");
      //e.printStackTrace();
    } catch (Exception e) {
      System.err.println("Exception " + e + " in pdf code for text extraction on file ");
      e.printStackTrace();
    }
   
    /**
     * extract data from pdf (if allowed).
     */
    if ((decode_pdf.isEncrypted()&&(!decode_pdf.isPasswordSupplied()))&& (!decode_pdf.isExtractionAllowed())) {
      System.out.println("Encrypted settings");
      System.out.println("Please look at SimpleViewer for code sample to handle such files");
     
    } else {
     
      ProgressMonitor status = new ProgressMonitor(currentGUI.getFrame(),
          Messages.getMessage("PdfViewerMessage.ExtractText"),"",startPage,endPage);
     
      /**
       * extract data from pdf
       */
      try {
        int count=0;
        boolean yesToAll = false;
        for (int page = startPage; page < endPage + 1; page++) { //read pages
          if(status.isCanceled()){
            currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport")
                +count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
            return;
          }
          //decode the page
          decode_pdf.decodePage(page);
         
          /** create a grouping object to apply grouping to data*/
          PdfGroupingAlgorithms currentGrouping =decode_pdf.getGroupingObject();
         
          /**use whole page size for  demo - get data from PageData object*/
          PdfPageData currentPageData = decode_pdf.getPdfPageData();
         
          int x1 = currentPageData.getMediaBoxX(page);
          int x2 = currentPageData.getMediaBoxWidth(page)+x1;
         
          int y2 = currentPageData.getMediaBoxY(page);
          int y1 = currentPageData.getMediaBoxHeight(page)+y2;
         
          /**Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */
         
          /**The call to extract the text*/
          String text =null;
         
          try{
            text =currentGrouping.extractTextInRectangle(
                x1,
                y1,
                x2,
                y2,
                page,
                false,
                true);
          } catch (PdfException e) {
            decode_pdf.closePdfFile();
            System.err.println("Exception " + e.getMessage()+" in file "+decode_pdf.getObjectStore().fullFileName);
            e.printStackTrace();
          }
         
          //allow for no text
          if(text==null)
            continue;
         
          String target=output_dir+separator+"rectangle"+separator;         
         
          //ensure a directory for data
          File page_path = new File(target);
          if (page_path.exists() == false)
            page_path.mkdirs();
         
          /**
           * choose correct prefix
           */
          String prefix="_text.txt";
          String encoding=System.getProperty("file.encoding");
         
          if(useXMLExtraction){
            prefix="_xml.txt";
            encoding="UTF-8";
          }

          File fileToSave = new File(target + fileName+ '_' +page + prefix);
          if(fileToSave.exists() && !yesToAll){
            if((endPage - startPage) > 1){
                      int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),true);
                     
                      if(n==0){
                        // clicked yes so just carry on for this once
                      }else if(n==1){
                        // clicked yes to all, so set flag
                        yesToAll = true;
                      }else if(n==2){
                        // clicked no, so loop round again
                        status.setProgress(page);
                        continue;
                      }else{
                       
                        currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") +
                            count+ ' ' +Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
                       
                        status.close();
                        return;
                      }
                    }else{
                      int n = currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(),false);
                     
                      if(n==0){
                        // clicked yes so just carry on
                      }else{
                        // clicked no, so exit
                        return;
                      }
                    }
          }

          /**
           * output the data
           */
          OutputStreamWriter output_stream =
            new OutputStreamWriter(
                new FileOutputStream(target + fileName+ '_' +page + prefix),
                encoding);
         
          if((useXMLExtraction)){
            output_stream.write(
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n");
            output_stream.write(
            "<!-- Pixel Location of text x1,y1,x2,y2\n");
            output_stream.write("(x1,y1 is top left corner)\n");
            output_stream.write("(x1,y1 is bottom right corner)\n");
            output_stream.write(
            "(origin is bottom left corner)  -->\n");
            output_stream.write("\n\n<ARTICLE>\n");
            output_stream.write(
                "<LOCATION x1=\""
                + x1
                + "\" "
                + "y1=\""
                + y1
                + "\" "
                + "x2=\""
                + x2
                + "\" "
                + "y2=\""
                + y2
                + "\" />\n");
            output_stream.write("\n\n<TEXT>\n");
            //NOTE DATA IS TECHNICALLY UNICODE
            output_stream.write(text); //write actual data
            output_stream.write("\n\n</TEXT>\n");
            output_stream.write("\n\n</ARTICLE>\n");
          }else
            output_stream.write(text); //write actual data
         
          count++;
          output_stream.close();
         
          status.setProgress(page+1);
         
          //remove data once written out
          decode_pdf.flushObjectValues(true);
        }
        status.close();
        currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.TextSavedTo")+ ' ' +output_dir);
       
      } catch (Exception e) {
        decode_pdf.closePdfFile();
        System.err.println("Exception " + e.getMessage());
        e.printStackTrace();
        System.out.println(decode_pdf.getObjectStore().getCurrentFilename());
      }
    }

    /**close the pdf file*/
    decode_pdf.closePdfFile();
   
    decode_pdf=null;
  }
TOP

Related Classes of org.jpedal.examples.simpleviewer.utils.Exporter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.