Source Code of de.chris_soft.nanodoa.FulltextRecognizer

/**
 * NanoDoA - File based document archive
 *
 * Copyright (C) 2011-2012 Christian Packenius, christian.packenius@googlemail.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.chris_soft.nanodoa;


import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Properties;


import javax.imageio.ImageIO;


import com.google.zxing.Result;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;


import de.chris_soft.nanoarchive.Metadata;
import de.chris_soft.utilities.BarcodeReader;
import de.chris_soft.utilities.FileUtils;
import de.chris_soft.utilities.IdUtils;
import de.chris_soft.utilities.LogUtils;
import de.chris_soft.utilities.OcrViaTesseractUtils;
import de.chris_soft.utilities.PdfUtils;
import de.chris_soft.utilities.TiffUtils;
import de.chris_soft.utilities.Utf8Utils;


/**
 * Recognizer of full text of a document.
 * @author Christian Packenius.
 */
public class FulltextRecognizer {
  private static Utf8Utils utf8 = new Utf8Utils();


  /**
   * Returns the full text of a document.
   * @param file Document file.
   * @param metadata Properties object with (and for) metadata to the document.
   * @return Full text of the document, including OCR results, real texts and
   *         barcodes.
   */
  public static String getFulltext(File file, Properties metadata) {
    String fulltext = "";
    try {
      String ocrtext = getOcrTextAndStoreBarcodes(file, metadata);
      metadata.setProperty(Metadata.OCRTEXT, ocrtext);
      fulltext += fulltext.length() == 0 ? ocrtext : "\r\n\r\n" + ocrtext;
    }
    catch (Exception e) {
      LogUtils.log(e);
    }


    try {
      String realtext = getRealText(file);
      metadata.setProperty(Metadata.REALTEXT, realtext);
      fulltext += fulltext.length() == 0 ? realtext : "\r\n\r\n" + realtext;
    }
    catch (IOException e) {
      LogUtils.log(e);
    }


    return fulltext;
  }


  /**
   * Get the real text from the file. Bitmaps don't have real text, but maybe
   * PDFs have.
   * @param file Document file.
   * @return Real text of the document.
   * @throws IOException
   */
  public static String getRealText(File file) throws IOException {
    if (PdfUtils.isPdfFile(file)) {
      return PdfUtils.getTextFromPdfFile(file);
    }
    return "";
  }


  /**
   * Get the OCR result of the given document.
   * @param file Document file.
   * @param metadata Properties object with (and for) metadata of the document.
   * @return OCR result as single string.
   * @throws Exception
   */
  public static String getOcrTextAndStoreBarcodes(File file, Properties metadata) throws Exception {
    OcrViaTesseractUtils ocr = new OcrViaTesseractUtils();
    String ext = FileUtils.getFileExtension(file).toLowerCase();
    String imageName = file.getCanonicalPath();
    File tmpDirectory = new File("tmp");
    tmpDirectory.mkdirs();
    String ocrName = new File(tmpDirectory, "" + IdUtils.getUniqueID()).getCanonicalPath();
    String text = "";
    if (ext.equals("pdf")) {
      text = workPdfFile(ocr, imageName, ocrName, metadata);
    }
    else if (ext.equals("tif") || ext.equals("tiff")) {
      text = workTiffFile(ocr, file, metadata);
    }
    else {
      text = workSingleImageFile(ocr, imageName, ocrName, metadata);
    }
    FileUtils.deleteFile(new File(ocrName + ".txt"));
    FileUtils.deleteFile(new File(ocrName));
    return text;
  }


  private static String workTiffFile(OcrViaTesseractUtils ocr, File tiffFile, Properties metadata) throws IOException {
    List<File> list = TiffUtils.convertTiffFileToPngFiles(tiffFile, new File("tmp"));
    StringBuilder text = new StringBuilder();
    StringBuilder barcodes = new StringBuilder();
    StringBuilder textBarcodes = new StringBuilder();
    while (!list.isEmpty()) {
      File pngPageFile = list.remove(0);
      String pngPageName = pngPageFile.getCanonicalPath();
      recognizeAndStoreOcrTextOfTiffPngFile(ocr, list, text, pngPageName);
      recognizeAndStoreBarcodesOfTiffPngFile(barcodes, textBarcodes, pngPageFile);
      removeTemporaryTiffPngPageFiles(pngPageName);
    }
    metadata.setProperty(Metadata.BARCODES, barcodes.toString());
    return text.toString() + textBarcodes.toString();
  }


  private static void recognizeAndStoreOcrTextOfTiffPngFile(OcrViaTesseractUtils ocr, List<File> list,
      StringBuilder text, String pngPageName) throws IOException {
    try {
      if (ocr.toFile(pngPageName, pngPageName) == 0) {
        text.append(utf8.read(pngPageName + ".txt"));
        if (!list.isEmpty()) {
          text.append("\r\n\r\n");
        }
      }
    }
    catch (InterruptedException exception) {
      // Ignore.
    }
  }


  private static void recognizeAndStoreBarcodesOfTiffPngFile(StringBuilder barcodes, StringBuilder textBarcodes,
      File pngPageFile) {
    try {
      BufferedImage image = ImageIO.read(pngPageFile);
      appendBarcodes(barcodes, textBarcodes, image);
    }
    catch (Exception exception) {
      // Ignore.
    }
  }


  private static void appendBarcodes(StringBuilder barcodes, StringBuilder textBarcodes, BufferedImage image) {
    Result[] results = BarcodeReader.getBarcodeResults(image);
    if (results != null) {
      for (Result result : results) {
        if (barcodes.length() > 0) {
          barcodes.append("\r\n\r\n");
        }
        barcodes.append(result.getBarcodeFormat().getName() + ": \r\n" + result.getText());
        textBarcodes.append("\r\n\r\n" + result.getText());
      }
    }
  }


  private static void removeTemporaryTiffPngPageFiles(String pngPageName) {
    FileUtils.deleteFile(new File(pngPageName + ".txt"));
    FileUtils.deleteFile(new File(pngPageName));
  }


  private static String workPdfFile(OcrViaTesseractUtils ocr, String pdfName, String ocrName, Properties metadata)
      throws Exception {
    PdfReader reader = new PdfReader(pdfName);
    StringBuilder pdfContent = new StringBuilder();
    StringBuilder barcodes = new StringBuilder();
    StringBuilder textBarcodes = new StringBuilder();
    for (int i = 0; i < reader.getXrefSize(); i++) {
      PdfObject pdfobj = reader.getPdfObject(i);
      if (pdfobj != null) {
        if (pdfobj.isStream()) {
          PdfStream stream = (PdfStream) pdfobj;
          PdfObject pdfsubtype = stream.get(PdfName.SUBTYPE);
          if (pdfsubtype != null) {
            if (pdfsubtype.toString().equals(PdfName.IMAGE.toString())) {
              System.out.println("OCR on PDF object " + i);
              getImageFromPageAndAddOcrResultToContent(ocr, pdfContent, barcodes, textBarcodes, ocrName, stream);
            }
          }
        }
      }
    }
    reader.close();
    metadata.setProperty(Metadata.BARCODES, barcodes.toString());
    return pdfContent.toString() + textBarcodes.toString();
  }


  private static void getImageFromPageAndAddOcrResultToContent(OcrViaTesseractUtils ocr, StringBuilder pdfContent,
      StringBuilder barcodes, StringBuilder textBarcodes, String ocrName, PdfStream stream) throws Exception {
    final File tmpJpegFile = new File(ocrName + ".jpg");
    writePdfImageFromStreamIntoFile(stream, tmpJpegFile);
    if (tmpJpegFile.exists()) {
      getBarcodesFromTemporaryImageFile(tmpJpegFile, barcodes, textBarcodes);
      String pdfPageContent = getOcrResultFromTemporaryImageFile(ocr, tmpJpegFile);
      if (pdfPageContent != null && pdfPageContent.trim().length() > 0) {
        if (pdfContent.length() > 0) {
          pdfContent.append("\r\n\r\n");
        }
        pdfContent.append(pdfPageContent);
      }
    }
    FileUtils.deleteFile(tmpJpegFile);
  }


  private static void getBarcodesFromTemporaryImageFile(File tmpJpegFile, StringBuilder barcodes,
      StringBuilder textBarcodes) {
    try {
      BufferedImage image = ImageIO.read(tmpJpegFile);
      appendBarcodes(barcodes, textBarcodes, image);
    }
    catch (Exception e) {
      LogUtils.log(e);
    }
  }


  private static void writePdfImageFromStreamIntoFile(PdfStream stream, final File tmpJpegFile) {
    try {
      PdfObject filterObject = stream.get(PdfName.FILTER);
      String filterName = filterObject.toString();
      if (filterName.equals("/DCTDecode")) {
        writePdfImageFromStreamViaRawImage(stream, tmpJpegFile);
      }
      else {
        writePdfImageFromStreamViaBufferedImage(stream, tmpJpegFile);
      }
    }
    catch (Exception e) {
      // Hier k�nnen vielf�ltige Fehler entstehen: Der Stream kann nicht
      // ausgelesen werden, was zum NullPointer f�hrt, es kann zu
      // Decodingfehlern des Images f�hren oder zu IO-Fehlern beim Wegschreiben.
      // All das f�hrt schlicht zu einem einzelnen Ergebnis: Das JPEG-File
      // sollte nicht angelegt werden.
      FileUtils.deleteFile(tmpJpegFile);
    }
  }


  private static void writePdfImageFromStreamViaRawImage(PdfStream stream, final File tmpJpegFile) throws IOException {
    byte[] img = PdfReader.getStreamBytesRaw((PRStream) stream);
    if (img != null) {
      FileOutputStream out = new FileOutputStream(tmpJpegFile);
      out.write(img);
      out.flush();
      out.close();
    }
  }


  private static void writePdfImageFromStreamViaBufferedImage(PdfStream stream, final File tmpJpegFile)
      throws IOException {
    PdfImageObject image = new PdfImageObject((PRStream) stream);
    BufferedImage bufferedImage = image.getBufferedImage();
    if (bufferedImage != null) {
      FileOutputStream out = new FileOutputStream(tmpJpegFile);
      ImageIO.write(bufferedImage, "jpg", out);
      out.close();
    }
  }


  private static String getOcrResultFromTemporaryImageFile(OcrViaTesseractUtils ocr, File tmpJpegFile)
      throws IOException, InterruptedException, Exception {
    String tmpName = tmpJpegFile.getCanonicalPath();
    int ocrRC;
    if ((ocrRC = ocr.toFile(tmpName, tmpName)) != 0) {
      throw new Exception("OCR (Tesseract) result of a pdf image was " + ocrRC);
    }
    String pdfPageContent = utf8.read(tmpName + ".txt");
    FileUtils.deleteFile(tmpJpegFile);
    FileUtils.deleteFile(new File(tmpName + ".txt"));
    return pdfPageContent;
  }


  private static String workSingleImageFile(OcrViaTesseractUtils ocr, String imageName, String ocrName,
      Properties metadata) throws IOException, InterruptedException {
    BufferedImage image = ImageIO.read(new File(imageName));
    StringBuilder barcodes = new StringBuilder();
    StringBuilder textBarcodes = new StringBuilder();
    metadata.setProperty(Metadata.BARCODES, barcodes.toString());
    appendBarcodes(barcodes, textBarcodes, image);
    if (ocr.toFile(imageName, ocrName) == 0) {
      return utf8.read(ocrName + ".txt") + textBarcodes.toString();
    }
    return textBarcodes.toString();
  }
}
Source Code of de.chris_soft.nanodoa.FulltextRecognizer

Related Classes of de.chris_soft.nanodoa.FulltextRecognizer