// Source code of de.chris_soft.nanodoa.misc.FulltextRecognizer

/**
 * NanoDoA - File based document archive
 *
 * Copyright (C) 2011-2012 Christian Packenius, christian.packenius@googlemail.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.chris_soft.nanodoa.misc;


import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Properties;


import javax.imageio.ImageIO;


import com.google.zxing.Result;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;


import de.chris_soft.nanoarchive.Metadata;
import de.chris_soft.nanodoa.God;
import de.chris_soft.utilities.BarcodeReader;
import de.chris_soft.utilities.FileUtils;
import de.chris_soft.utilities.IdUtils;
import de.chris_soft.utilities.LogUtils;
import de.chris_soft.utilities.OcrViaTesseractUtils;
import de.chris_soft.utilities.TiffUtils;
import de.chris_soft.utilities.Utf8Utils;
import de.chris_soft.utilities.pdf.PdfUtils;


/**
 * Recognizer of full text of a document.
 * @author Christian Packenius.
 */
public class FulltextRecognizer {
  private static Utf8Utils utf8 = new Utf8Utils();


  /**
   * Returns the full text of a document.
   * @param file Document file.
   * @param metadata Properties object with (and for) metadata to the document.
   * @return Full text of the document, including OCR results, real texts and
   *         barcodes.
   * @throws DocumentReadingException
   */
  public static String getFulltext(File file, Properties metadata) throws DocumentReadingException {
    String fulltext = "";
    String ocrtext = getOcrTextAndStoreBarcodes(file, metadata);
    metadata.setProperty(Metadata.OCRTEXT, ocrtext);
    fulltext += fulltext.length() == 0 ? ocrtext : "\r\n\r\n" + ocrtext;


    String realtext = "";
    try {
      realtext = getRealText(file);
    }
    catch (IOException exception) {
      LogUtils.log(exception);
    }
    metadata.setProperty(Metadata.REALTEXT, realtext);
    fulltext += fulltext.length() == 0 ? realtext : "\r\n\r\n" + realtext;


    return fulltext;
  }


  /**
   * Get the real text from the file. Bitmaps don't have real text, but maybe
   * PDFs have.
   * @param file Document file.
   * @return Real text of the document.
   * @throws IOException
   */
  public static String getRealText(File file) throws IOException {
    if (PdfUtils.isPdfFile(file)) {
      return PdfUtils.getTextFromPdfFile(file);
    }
    return "";
  }


  /**
   * Get the OCR result of the given document.
   * @param file Document file.
   * @param metadata Properties object with (and for) metadata of the document.
   * @return OCR result as single string.
   * @throws DocumentReadingException
   */
  public static String getOcrTextAndStoreBarcodes(File file, Properties metadata) throws DocumentReadingException {
    OcrViaTesseractUtils ocr = new OcrViaTesseractUtils();
    String ext = FileUtils.getFileExtension(file).toLowerCase();
    String imageName;
    try {
      imageName = file.getCanonicalPath();
    }
    catch (IOException exception) {
      imageName = file.getAbsolutePath();
    }
    File tmpDirectory = new File("tmp");
    tmpDirectory.mkdirs();
    String ocrName;
    try {
      ocrName = new File(tmpDirectory, "" + IdUtils.getUniqueID()).getCanonicalPath();
    }
    catch (IOException exception) {
      ocrName = new File(tmpDirectory, "" + IdUtils.getUniqueID()).getAbsolutePath();
    }
    String text = "";
    if (ext.equals("pdf")) {
      text = workPdfFile(ocr, imageName, ocrName, metadata);
    }
    else if (ext.equals("tif") || ext.equals("tiff")) {
      text = workTiffFile(ocr, file, metadata);
    }
    else {
      text = workSingleImageFile(ocr, imageName, ocrName, metadata);
    }
    FileUtils.deleteFile(new File(ocrName + ".txt"));
    FileUtils.deleteFile(new File(ocrName));
    return text;
  }


  private static String workTiffFile(OcrViaTesseractUtils ocr, File tiffFile, Properties metadata)
      throws DocumentReadingException {
    List<File> list;
    try {
      list = TiffUtils.convertTiffFileToPngFiles(tiffFile, new File("tmp"));
    }
    catch (IOException exception) {
      throw new DocumentReadingException(exception);
    }
    StringBuilder text = new StringBuilder();
    StringBuilder barcodes = new StringBuilder();
    StringBuilder textBarcodes = new StringBuilder();
    while (!list.isEmpty()) {
      File pngPageFile = list.remove(0);
      String pngPageName;
      try {
        pngPageName = pngPageFile.getCanonicalPath();
      }
      catch (IOException exception) {
        pngPageName = pngPageFile.getAbsolutePath();
      }
      try {
        recognizeAndStoreOcrTextOfTiffPngFile(ocr, list, text, pngPageName);
      }
      catch (IOException exception) {
        // Ignore - single page can not be recognized.
      }
      recognizeAndStoreBarcodesOfTiffPngFile(barcodes, textBarcodes, pngPageFile);
      removeTemporaryTiffPngPageFiles(pngPageName);
    }
    metadata.setProperty(Metadata.BARCODES, barcodes.toString());
    return text.toString() + textBarcodes.toString();
  }


  private static void recognizeAndStoreOcrTextOfTiffPngFile(OcrViaTesseractUtils ocr, List<File> list,
      StringBuilder text, String pngPageName) throws IOException {
    try {
      if (ocr.toFile(pngPageName, pngPageName, 30000, 500) == 0) {
        text.append(utf8.read(pngPageName + ".txt"));
        if (!list.isEmpty()) {
          text.append("\r\n\r\n");
        }
      }
    }
    catch (InterruptedException exception) {
      // Ignore.
    }
  }


  private static void recognizeAndStoreBarcodesOfTiffPngFile(StringBuilder barcodes, StringBuilder textBarcodes,
      File pngPageFile) {
    try {
      BufferedImage image = ImageIO.read(pngPageFile);
      appendBarcodes(barcodes, textBarcodes, image);
    }
    catch (Exception exception) {
      // Ignore.
    }
  }


  private static void appendBarcodes(StringBuilder barcodes, StringBuilder textBarcodes, BufferedImage image) {
    Result[] results = BarcodeReader.getBarcodeResults(image);
    if (results != null) {
      for (Result result : results) {
        if (barcodes.length() > 0) {
          barcodes.append("\r\n\r\n");
        }
        barcodes.append(result.getBarcodeFormat().getName() + ": \r\n" + result.getText());
        textBarcodes.append("\r\n\r\n" + result.getText());
      }
    }
  }


  private static void removeTemporaryTiffPngPageFiles(String pngPageName) {
    FileUtils.deleteFile(new File(pngPageName + ".txt"));
    FileUtils.deleteFile(new File(pngPageName));
  }


  private static String workPdfFile(OcrViaTesseractUtils ocr, String pdfName, String ocrName, Properties metadata)
      throws DocumentReadingException {
    PdfReader reader;
    int xrefSize;
    try {
      reader = new PdfReader(pdfName);
      xrefSize = reader.getXrefSize();
    }
    catch (IOException exception) {
      throw new DocumentReadingException(exception);
    }
    StringBuilder pdfContent = new StringBuilder();
    StringBuilder barcodes = new StringBuilder();
    StringBuilder textBarcodes = new StringBuilder();
    for (int i = 0; i < xrefSize; i++) {
      God.appWindow.getStatusBar().setInitProgress(i, xrefSize);
      try {
        PdfObject pdfobj = reader.getPdfObject(i);
        if (pdfobj != null) {
          if (pdfobj.isStream()) {
            PdfStream stream = (PdfStream) pdfobj;
            PdfObject pdfsubtype = stream.get(PdfName.SUBTYPE);
            if (pdfsubtype != null) {
              if (pdfsubtype.toString().equals(PdfName.IMAGE.toString())) {
                getImageFromPageAndAddOcrResultToContent(ocr, pdfContent, barcodes, textBarcodes, ocrName, stream);
              }
            }
          }
        }
      }
      catch (Exception e) {
        // We may have exceptions within this operations.
      }
    }
    try {
      reader.close();
    }
    catch (Exception e) {
      // Ignore.
    }
    metadata.setProperty(Metadata.BARCODES, barcodes.toString());
    return pdfContent.toString() + textBarcodes.toString();
  }


  private static void getImageFromPageAndAddOcrResultToContent(OcrViaTesseractUtils ocr, StringBuilder pdfContent,
      StringBuilder barcodes, StringBuilder textBarcodes, String ocrName, PdfStream stream) throws Exception {
    final File tmpJpegFile = new File(ocrName + ".jpg");
    writePdfImageFromStreamIntoFile(stream, tmpJpegFile);
    if (tmpJpegFile.exists()) {
      getBarcodesFromTemporaryImageFile(tmpJpegFile, barcodes, textBarcodes);
      String pdfPageContent = getOcrResultFromTemporaryImageFile(ocr, tmpJpegFile);
      if (pdfPageContent != null && pdfPageContent.trim().length() > 0) {
        if (pdfContent.length() > 0) {
          pdfContent.append("\r\n\r\n");
        }
        pdfContent.append(pdfPageContent);
      }
    }
    FileUtils.deleteFile(tmpJpegFile);
  }


  private static void getBarcodesFromTemporaryImageFile(File tmpJpegFile, StringBuilder barcodes,
      StringBuilder textBarcodes) {
    try {
      BufferedImage image = ImageIO.read(tmpJpegFile);
      appendBarcodes(barcodes, textBarcodes, image);
    }
    catch (Exception e) {
      LogUtils.log(e);
    }
  }


  private static void writePdfImageFromStreamIntoFile(PdfStream stream, final File tmpJpegFile) {
    try {
      PdfObject filterObject = stream.get(PdfName.FILTER);
      String filterName = filterObject.toString();
      if (filterName.equals("/DCTDecode")) {
        writePdfImageFromStreamViaRawImage(stream, tmpJpegFile);
      }
      else {
        writePdfImageFromStreamViaBufferedImage(stream, tmpJpegFile);
      }
    }
    catch (Exception e) {
      // Hier koennen vielfaeltige Fehler entstehen: Der Stream kann nicht
      // ausgelesen werden, was zum NullPointer fuehrt, es kann zu
      // Decodingfehlern des Images fuehren oder zu IO-Fehlern beim
      // Wegschreiben.
      // All das fuehrt schlicht zu einem einzelnen Ergebnis: Das JPEG-File
      // sollte nicht angelegt werden.
      FileUtils.deleteFile(tmpJpegFile);
    }
  }


  private static void writePdfImageFromStreamViaRawImage(PdfStream stream, final File tmpJpegFile) throws IOException {
    byte[] img = PdfReader.getStreamBytesRaw((PRStream) stream);
    if (img != null) {
      FileOutputStream out = new FileOutputStream(tmpJpegFile);
      out.write(img);
      out.flush();
      out.close();
    }
  }


  private static void writePdfImageFromStreamViaBufferedImage(PdfStream stream, final File tmpJpegFile)
      throws IOException {
    PdfImageObject image = new PdfImageObject((PRStream) stream);
    BufferedImage bufferedImage = image.getBufferedImage();
    if (bufferedImage != null) {
      FileOutputStream out = new FileOutputStream(tmpJpegFile);
      ImageIO.write(bufferedImage, "jpg", out);
      out.close();
    }
  }


  private static String getOcrResultFromTemporaryImageFile(OcrViaTesseractUtils ocr, File tmpJpegFile) throws Exception {
    String tmpName = null;
    try {
      tmpName = tmpJpegFile.getCanonicalPath();
      int ocrRC;
      if ((ocrRC = ocr.toFile(tmpName, tmpName, 30000, 500)) != 0) {
        throw new Exception("OCR (Tesseract) result of a pdf image was " + ocrRC);
      }
      return utf8.read(tmpName + ".txt");
    }
    finally {
      FileUtils.deleteFile(tmpJpegFile);
      if (tmpName != null) {
        FileUtils.deleteFile(new File(tmpName + ".txt"));
      }
    }
  }


  private static String workSingleImageFile(OcrViaTesseractUtils ocr, String imageName, String ocrName,
      Properties metadata) throws DocumentReadingException {
    try {
      BufferedImage image = ImageIO.read(new File(imageName));
      StringBuilder barcodes = new StringBuilder();
      StringBuilder textBarcodes = new StringBuilder();
      metadata.setProperty(Metadata.BARCODES, barcodes.toString());
      appendBarcodes(barcodes, textBarcodes, image);
      if (ocr.toFile(imageName, ocrName, 30000, 500) == 0) {
        String ocrTextFileName = ocrName + ".txt";
        return utf8.read(ocrTextFileName) + textBarcodes.toString();
      }
      return textBarcodes.toString();
    }
    catch (Exception e) {
      throw new DocumentReadingException(e);
    }
  }
}
// Source code of de.chris_soft.nanodoa.misc.FulltextRecognizer
//
// Related classes of de.chris_soft.nanodoa.misc.FulltextRecognizer