// Source code of org.apache.tika.parser.ocr.TesseractOCRParser

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ocr;


import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;


import javax.imageio.ImageIO;


import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.PSDParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.jpeg.JpegParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;


/**
 * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
 * create a {@link TesseractOCRConfig} object and pass it through a
 * ParseContext. Tesseract-ocr must be installed and on system path or the path
 * to its root folder must be provided:
 * <p>
 * TesseractOCRConfig config = new TesseractOCRConfig();<br>
 * //Needed if tesseract is not on system path<br>
 * config.setTesseractPath(tesseractFolder);<br>
 * parseContext.set(TesseractOCRConfig.class, config);<br>
 * </p>
 * 
 * 
 */
public class TesseractOCRParser extends AbstractParser {


  private static final long serialVersionUID = 1L;


  private static final Set<MediaType> SUPPORTED_TYPES = getTypes();


  private static Set<MediaType> getTypes() {
    HashSet<MediaType> supportedTypes = new HashSet<MediaType>();


    supportedTypes.add(MediaType.image("png"));
    supportedTypes.add(MediaType.image("jpeg"));
    supportedTypes.add(MediaType.image("tiff"));
    supportedTypes.add(MediaType.image("x-ms-bmp"));
    supportedTypes.add(MediaType.image("gif"));


    return supportedTypes;
  }


  @Override
  public Set<MediaType> getSupportedTypes(ParseContext arg0) {
    return SUPPORTED_TYPES;
  }


  private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
    if (!config.getTesseractPath().isEmpty()) {
      Map<String, String> env = pb.environment();
      env.put("TESSDATA_PREFIX", config.getTesseractPath());
    }
  }


  public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
      SAXException, TikaException {


    TemporaryResources tmp = new TemporaryResources();
    FileOutputStream fos = null;
    TikaInputStream tis = null;
    try {
      int w = image.getWidth(null);
      int h = image.getHeight(null);
      BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
      Graphics2D g2 = bImage.createGraphics();
      g2.drawImage(image, 0, 0, null);
      g2.dispose();
      File file = tmp.createTemporaryFile();
      fos = new FileOutputStream(file);
      ImageIO.write(bImage, "png", fos);
      bImage = null;
      tis = TikaInputStream.get(file);
      parse(tis, handler, metadata, context);


    } finally {
      tmp.dispose();
      if (tis != null)
        tis.close();
      if (fos != null)
        fos.close();
    }


  }


  @Override
  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {


    TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
    if (config == null)
      config = new TesseractOCRConfig();


    String[] checkCmd = { config.getTesseractPath() + getTesseractProg() };
    // If Tesseract is not on the path, do not try to run OCR.
    if (!ExternalParser.check(checkCmd))
      return;


    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);


    TemporaryResources tmp = new TemporaryResources();
    File output = null;
    try {
      TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
      File input = tikaStream.getFile();
      long size = tikaStream.getLength();


      if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {


        output = tmp.createTemporaryFile();
        doOCR(input, output, config);


        // Tesseract appends .txt to output file name
        output = new File(output.getAbsolutePath() + ".txt");


        if (output.exists())
          extractOutput(new FileInputStream(output), xhtml);


      }


    } finally {
      tmp.dispose();
      if (output != null)
        output.delete();


    }
  }


  /**
   * Run external tesseract-ocr process.
   * 
   * @param input
   *          File to be ocred
   * @param output
   *          File to collect ocr result
   * @param config
   *          Configuration of tesseract-ocr engine
   * @throws TikaException
   *           if the extraction timed out
   * @throws IOException
   *           if an input error occurred
   */
  private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
    String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
        config.getLanguage(), "-psm", config.getPageSegMode() };


    ProcessBuilder pb = new ProcessBuilder(cmd);
    setEnv(config, pb);
    final Process process = pb.start();


    process.getOutputStream().close();
    InputStream out = process.getInputStream();
    InputStream err = process.getErrorStream();


    logStream("OCR MSG", out, input);
    logStream("OCR ERROR", err, input);


    FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
      public Integer call() throws Exception {
        return process.waitFor();
      }
    });


    Thread waitThread = new Thread(waitTask);
    waitThread.start();


    try {
      waitTask.get(config.getTimeout(), TimeUnit.SECONDS);


    } catch (InterruptedException e) {
      waitThread.interrupt();
      process.destroy();
      Thread.currentThread().interrupt();
      throw new TikaException("TesseractOCRParser interrupted", e);


    } catch (ExecutionException e) {
      // should not be thrown


    } catch (TimeoutException e) {
      waitThread.interrupt();
      process.destroy();
      throw new TikaException("TesseractOCRParser timeout", e);
    }


  }


  /**
   * Reads the contents of the given stream and write it to the given XHTML
   * content handler. The stream is closed once fully processed.
   * 
   * @param stream
   *          Stream where is the result of ocr
   * @param xhtml
   *          XHTML content handler
   * @throws SAXException
   *           if the XHTML SAX events could not be handled
   * @throws IOException
   *           if an input error occurred
   */
  private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {


    Reader reader = new InputStreamReader(stream, "UTF-8");
    xhtml.startDocument();
    xhtml.startElement("div");
    try {
      char[] buffer = new char[1024];
      for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
        if (n > 0)
          xhtml.characters(buffer, 0, n);
      }
    } finally {
      reader.close();
    }
    xhtml.endElement("div");
    xhtml.endDocument();
  }


  /**
   * Starts a thread that reads the contents of the standard output or error
   * stream of the given process to not block the process. The stream is closed
   * once fully processed.
   */
  private void logStream(final String logType, final InputStream stream, final File file) {
    new Thread() {
      public void run() {
        Reader reader = new InputStreamReader(stream);
        StringBuilder out = new StringBuilder();
        char[] buffer = new char[1024];
        try {
          for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
            out.append(buffer, 0, n);
        } catch (IOException e) {


        } finally {
          IOUtils.closeQuietly(stream);
        }


        String msg = out.toString();
        // log or discard message?


      }
    }.start();
  }
  
  static String getTesseractProg() {
    return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
  }


}
// Source code of org.apache.tika.parser.ocr.TesseractOCRParser

// Related classes of org.apache.tika.parser.ocr.TesseractOCRParser