Package org.apache.uima.tika

Source Code of org.apache.uima.tika.MarkupAnnotator

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.tika;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Iterator;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.Type;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;


/** Uses TIKA to convert original markup into UIMA annotations**/
public class MarkupAnnotator extends CasAnnotator_ImplBase {


  private final static String ORIGINAL_VIEW_PARAM_NAME = "ORIGINAL_VIEW_PARAM_NAME";
  private final static String TEXT_VIEW_PARAM_NAME = "TEXT_VIEW_PARAM_NAME";
  private final static String SET_TEXT_VIEW_DEFAULT_PARAM_NAME = "SET_TEXT_VIEW_DEFAULT_PARAM_NAME";
 
  private final static String tika_file_param = "tikaConfigFile";
 
  // takes an option indicating the name of the view containing the binary document
  private String originalViewName = "_InitialView";
 
  // takes an option indicating the name of the view containing the text version of the document
  private String textViewName = "textView";
 
  // whether to make the text view default or not
  private Boolean makeTextDefaultView = true;
 
  // configuration for TIKA - can be created by specifying a custom resource
  private TikaConfig config = null;
 
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    // Get config param setting
    originalViewName  = (String) aContext.getConfigParameterValue(ORIGINAL_VIEW_PARAM_NAME);

    textViewName = (String) aContext.getConfigParameterValue(TEXT_VIEW_PARAM_NAME);
    if (textViewName==null) {
      System.err.println("Parameter TEXT_VIEW_PARAM_NAME is null; setting to \"textView\"");
      textViewName = "textView";
    }
    else System.err.println("Parameter TEXT_VIEW_PARAM_NAME is "+textViewName);
   
    makeTextDefaultView = (Boolean) aContext.getConfigParameterValue(SET_TEXT_VIEW_DEFAULT_PARAM_NAME);
    if (makeTextDefaultView==null) {
      System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is null; setting to \"true\"");
      makeTextDefaultView = new Boolean(true);
    }
    else System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is "+makeTextDefaultView);
   
    // initialise TIKA parser
    // try to get a custom config
    URL tikaConfigURL = null;
    try {
      tikaConfigURL = getContext().getResourceURL(tika_file_param);
      config = new TikaConfig(tikaConfigURL);
    } catch (Exception e1) {
      // to log
      System.err.println("Failed to load TIKA config file from "+tikaConfigURL);
      config = null;
    }

    // if not rely on default one
    if (config==null){
      try {
        config = TikaConfig.getDefaultConfig();
      } catch (TikaException e) {
        throw new ResourceInitializationException(e);
      }
    }
   
  }
 
  public void process(CAS cas) throws AnalysisEngineProcessException {
      CAS originalCas = null;
      try {
      originalCas = cas.getView(originalViewName);
      }
      catch (Exception e){
        String viewName = cas.getViewName();
        // can't find originalViewName
        System.err.println("can't find view "+originalViewName+" using "+viewName+" instead");
        originalCas = cas.getCurrentView();
      }
     
      InputStream originalStream = originalCas.getSofa().getSofaDataStream();
   
      String lang = null;
     
      // parsing with TIKA
     
      // TODO if content type is known then we use it
      // otherwise we guess
     
      Parser parser = new AutoDetectParser(config);

      Metadata md = new Metadata();
      MarkupHandler handler  = new MarkupHandler();     

      try {
        parser.parse(originalStream,handler , md);
      }
      catch (Exception e){
        // if we have a problem just dump the message and continue
        System.err.println("Problem converting file : "+e.getMessage());
        // PROBLEM => trying to serialize binary content in XML crash!
        return;
      }
      finally {
        try {
        originalStream.close();
      } catch (IOException e) {
      }
      }
     
      CAS plainTextView = cas.createView(textViewName);
     

      handler.populateCAS(plainTextView);
      plainTextView.setDocumentLanguage(lang);
     
      // get additional metadata about the document
      // e.g content type etc...
      // TODO add possibility to define type as parameter and discover
      // feature names on the fly
      JCas ptv=null;
    try {
      ptv = plainTextView.getJCas();
    } catch (CASException e) {
      e.printStackTrace();
      return;
    }
     
      Type docAnnotationType = ptv.getTypeSystem().getType("org.apache.uima.SourceDocumentAnnotation");
      Iterator iter = ptv.getAnnotationIndex(docAnnotationType).iterator();
      SourceDocumentAnnotation docAnnotation = null;
      // do we already have one?
      if (iter.hasNext()) docAnnotation = (SourceDocumentAnnotation) iter.next();
      // otherwise let's create a new annotation
      else docAnnotation = new SourceDocumentAnnotation(ptv);
     
      // now iterate on the metadata found by Tika and add them to the info
      if (docAnnotation.getFeatures()==null)
        docAnnotation.setFeatures((FSArray) cas.createArrayFS(md.size())) ;
     
      for (int i=0;i<md.size();i++){
        String name = md.names()[i];
        String value = md.get(name);
        FeatureValue fv = new FeatureValue(ptv);
        fv.setName(name);
        fv.setValue(value);
        docAnnotation.setFeatures(i,fv);
      }
      docAnnotation.addToIndexes();
    
  }

}
TOP

Related Classes of org.apache.uima.tika.MarkupAnnotator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.