/*
 * NanoDoA - File based document archive
 * Copyright (C) 2011-2012 Christian Packenius, christian.packenius@googlemail.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.chris_soft.nanodoa;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Properties;
import javax.imageio.ImageIO;
import com.google.zxing.Result;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import de.chris_soft.nanoarchive.Metadata;
import de.chris_soft.utilities.BarcodeReader;
import de.chris_soft.utilities.FileUtils;
import de.chris_soft.utilities.IdUtils;
import de.chris_soft.utilities.LogUtils;
import de.chris_soft.utilities.OcrViaTesseractUtils;
import de.chris_soft.utilities.PdfUtils;
import de.chris_soft.utilities.TiffUtils;
import de.chris_soft.utilities.Utf8Utils;
/**
 * Recognizer of the full text of a document.
 *
 * @author Christian Packenius.
 */
public class FulltextRecognizer {
/** Shared helper used by this class to read the text files produced by the OCR runs. */
private static final Utf8Utils utf8 = new Utf8Utils();
* Returns the full text of a document.
* @param file Document file.
* @param metadata Properties object with (and for) metadata to the document.
* @return Full text of the document, including OCR results, real texts and
* barcodes.
public static String getFulltext(File file, Properties metadata) {
String fulltext = "";
try {
String ocrtext = getOcrTextAndStoreBarcodes(file, metadata);
metadata.setProperty(Metadata.OCRTEXT, ocrtext);
fulltext += fulltext.length() == 0 ? ocrtext : "\r\n\r\n" + ocrtext;
catch (Exception e) {
try {
String realtext = getRealText(file);
metadata.setProperty(Metadata.REALTEXT, realtext);
fulltext += fulltext.length() == 0 ? realtext : "\r\n\r\n" + realtext;
catch (IOException e) {
return fulltext;
* Get the real text from the file. Bitmaps don't have real text, but maybe
* PDFs have.
* @param file Document file.
* @return Real text of the document.
* @throws IOException
public static String getRealText(File file) throws IOException {
if (PdfUtils.isPdfFile(file)) {
return PdfUtils.getTextFromPdfFile(file);
return "";
* Get the OCR result of the given document.
* @param file Document file.
* @param metadata Properties object with (and for) metadata of the document.
* @return OCR result as single string.
* @throws Exception
public static String getOcrTextAndStoreBarcodes(File file, Properties metadata) throws Exception {
OcrViaTesseractUtils ocr = new OcrViaTesseractUtils();
String ext = FileUtils.getFileExtension(file).toLowerCase();
String imageName = file.getCanonicalPath();
File tmpDirectory = new File("tmp");
String ocrName = new File(tmpDirectory, "" + IdUtils.getUniqueID()).getCanonicalPath();
String text = "";
if (ext.equals("pdf")) {
text = workPdfFile(ocr, imageName, ocrName, metadata);
else if (ext.equals("tif") || ext.equals("tiff")) {
text = workTiffFile(ocr, file, metadata);
else {
text = workSingleImageFile(ocr, imageName, ocrName, metadata);
FileUtils.deleteFile(new File(ocrName + ".txt"));
FileUtils.deleteFile(new File(ocrName));
return text;
private static String workTiffFile(OcrViaTesseractUtils ocr, File tiffFile, Properties metadata) throws IOException {
List<File> list = TiffUtils.convertTiffFileToPngFiles(tiffFile, new File("tmp"));
StringBuilder text = new StringBuilder();
StringBuilder barcodes = new StringBuilder();
StringBuilder textBarcodes = new StringBuilder();
while (!list.isEmpty()) {
File pngPageFile = list.remove(0);
String pngPageName = pngPageFile.getCanonicalPath();
recognizeAndStoreOcrTextOfTiffPngFile(ocr, list, text, pngPageName);
recognizeAndStoreBarcodesOfTiffPngFile(barcodes, textBarcodes, pngPageFile);
metadata.setProperty(Metadata.BARCODES, barcodes.toString());
return text.toString() + textBarcodes.toString();
/**
 * Runs OCR on one PNG page file and appends the recognized text to
 * {@code text}.
 *
 * @param ocr OCR engine wrapper.
 * @param list Page files that have not been processed yet; only checked for
 *        emptiness here.
 * @param text Receives the recognized text.
 * @param pngPageName Path of the PNG page file; also used as the base name of
 *        the OCR output, which is read back from {@code pngPageName + ".txt"}.
 * @throws IOException Declared by this method; presumably raised while
 *         reading the OCR result file - confirm against Utf8Utils.
 */
private static void recognizeAndStoreOcrTextOfTiffPngFile(OcrViaTesseractUtils ocr, List<File> list,
StringBuilder text, String pngPageName) throws IOException {
try {
// A return value of 0 is treated as success; only then the result file is read.
if (ocr.toFile(pngPageName, pngPageName) == 0) {
text.append(utf8.read(pngPageName + ".txt"));
// NOTE(review): the body of the following condition is not visible in this
// view of the file. Presumably a page separator is appended to text when
// further pages follow - confirm against the original source.
if (!list.isEmpty()) {
catch (InterruptedException exception) {
// Ignore.
// NOTE(review): the interrupt status of the thread is not restored here.
private static void recognizeAndStoreBarcodesOfTiffPngFile(StringBuilder barcodes, StringBuilder textBarcodes,
File pngPageFile) {
try {
BufferedImage image = ImageIO.read(pngPageFile);
appendBarcodes(barcodes, textBarcodes, image);
catch (Exception exception) {
// Ignore.
/**
 * Collects the barcodes that {@code BarcodeReader} finds in an image.
 *
 * @param barcodes Receives, per barcode, the format name followed by
 *        {@code ": \r\n"} and the barcode text.
 * @param textBarcodes Receives, per barcode, {@code "\r\n\r\n"} followed by
 *        the barcode text.
 * @param image Image to inspect.
 */
private static void appendBarcodes(StringBuilder barcodes, StringBuilder textBarcodes, BufferedImage image) {
Result[] results = BarcodeReader.getBarcodeResults(image);
// A null result array is skipped silently.
if (results != null) {
for (Result result : results) {
// NOTE(review): the body (or the end) of the following condition is not
// visible in this view of the file. Presumably it only appends a separator
// in front of every barcode but the first - confirm against the original
// source; if the two appends below were inside it, nothing would ever be
// appended.
if (barcodes.length() > 0) {
barcodes.append(result.getBarcodeFormat().getName() + ": \r\n" + result.getText());
textBarcodes.append("\r\n\r\n" + result.getText());
private static void removeTemporaryTiffPngPageFiles(String pngPageName) {
FileUtils.deleteFile(new File(pngPageName + ".txt"));
FileUtils.deleteFile(new File(pngPageName));
private static String workPdfFile(OcrViaTesseractUtils ocr, String pdfName, String ocrName, Properties metadata)
throws Exception {
PdfReader reader = new PdfReader(pdfName);
StringBuilder pdfContent = new StringBuilder();
StringBuilder barcodes = new StringBuilder();
StringBuilder textBarcodes = new StringBuilder();
for (int i = 0; i < reader.getXrefSize(); i++) {
PdfObject pdfobj = reader.getPdfObject(i);
if (pdfobj != null) {
if (pdfobj.isStream()) {
PdfStream stream = (PdfStream) pdfobj;
PdfObject pdfsubtype = stream.get(PdfName.SUBTYPE);
if (pdfsubtype != null) {
if (pdfsubtype.toString().equals(PdfName.IMAGE.toString())) {
System.out.println("OCR on PDF object " + i);
getImageFromPageAndAddOcrResultToContent(ocr, pdfContent, barcodes, textBarcodes, ocrName, stream);
metadata.setProperty(Metadata.BARCODES, barcodes.toString());
return pdfContent.toString() + textBarcodes.toString();
/**
 * Extracts one PDF image stream into a temporary JPEG file
 * ({@code ocrName + ".jpg"}), collects its barcodes and runs OCR on it.
 *
 * @param ocr OCR engine wrapper.
 * @param pdfContent Collected OCR text of the PDF so far.
 * @param barcodes Receives the barcode list for the metadata.
 * @param textBarcodes Receives the barcode texts for the full text.
 * @param ocrName Base name for the temporary JPEG file.
 * @param stream PDF image stream.
 * @throws Exception If the OCR of the image fails.
 */
private static void getImageFromPageAndAddOcrResultToContent(OcrViaTesseractUtils ocr, StringBuilder pdfContent,
StringBuilder barcodes, StringBuilder textBarcodes, String ocrName, PdfStream stream) throws Exception {
final File tmpJpegFile = new File(ocrName + ".jpg");
writePdfImageFromStreamIntoFile(stream, tmpJpegFile);
// The writer creates no file when the image could not be extracted; such
// images are skipped.
if (tmpJpegFile.exists()) {
getBarcodesFromTemporaryImageFile(tmpJpegFile, barcodes, textBarcodes);
String pdfPageContent = getOcrResultFromTemporaryImageFile(ocr, tmpJpegFile);
// Only non-blank OCR results are of interest.
if (pdfPageContent != null && pdfPageContent.trim().length() > 0) {
// NOTE(review): the rest of this method is not visible in this view of the
// file. Presumably a separator and the page content are appended to
// pdfContent and the temporary JPEG file is removed - confirm against the
// original source.
if (pdfContent.length() > 0) {
private static void getBarcodesFromTemporaryImageFile(File tmpJpegFile, StringBuilder barcodes,
StringBuilder textBarcodes) {
try {
BufferedImage image = ImageIO.read(tmpJpegFile);
appendBarcodes(barcodes, textBarcodes, image);
catch (Exception e) {
private static void writePdfImageFromStreamIntoFile(PdfStream stream, final File tmpJpegFile) {
try {
PdfObject filterObject = stream.get(PdfName.FILTER);
String filterName = filterObject.toString();
if (filterName.equals("/DCTDecode")) {
writePdfImageFromStreamViaRawImage(stream, tmpJpegFile);
else {
writePdfImageFromStreamViaBufferedImage(stream, tmpJpegFile);
catch (Exception e) {
// Hier k�nnen vielf�ltige Fehler entstehen: Der Stream kann nicht
// ausgelesen werden, was zum NullPointer f�hrt, es kann zu
// Decodingfehlern des Images f�hren oder zu IO-Fehlern beim Wegschreiben.
// All das f�hrt schlicht zu einem einzelnen Ergebnis: Das JPEG-File
// sollte nicht angelegt werden.
private static void writePdfImageFromStreamViaRawImage(PdfStream stream, final File tmpJpegFile) throws IOException {
byte[] img = PdfReader.getStreamBytesRaw((PRStream) stream);
if (img != null) {
FileOutputStream out = new FileOutputStream(tmpJpegFile);
private static void writePdfImageFromStreamViaBufferedImage(PdfStream stream, final File tmpJpegFile)
throws IOException {
PdfImageObject image = new PdfImageObject((PRStream) stream);
BufferedImage bufferedImage = image.getBufferedImage();
if (bufferedImage != null) {
FileOutputStream out = new FileOutputStream(tmpJpegFile);
ImageIO.write(bufferedImage, "jpg", out);
private static String getOcrResultFromTemporaryImageFile(OcrViaTesseractUtils ocr, File tmpJpegFile)
throws IOException, InterruptedException, Exception {
String tmpName = tmpJpegFile.getCanonicalPath();
int ocrRC;
if ((ocrRC = ocr.toFile(tmpName, tmpName)) != 0) {
throw new Exception("OCR (Tesseract) result of a pdf image was " + ocrRC);
String pdfPageContent = utf8.read(tmpName + ".txt");
FileUtils.deleteFile(new File(tmpName + ".txt"));
return pdfPageContent;
private static String workSingleImageFile(OcrViaTesseractUtils ocr, String imageName, String ocrName,
Properties metadata) throws IOException, InterruptedException {
BufferedImage image = ImageIO.read(new File(imageName));
StringBuilder barcodes = new StringBuilder();
StringBuilder textBarcodes = new StringBuilder();
metadata.setProperty(Metadata.BARCODES, barcodes.toString());
appendBarcodes(barcodes, textBarcodes, image);
if (ocr.toFile(imageName, ocrName) == 0) {
return utf8.read(ocrName + ".txt") + textBarcodes.toString();
return textBarcodes.toString();