Package org.apache.solr.handler.dataimport

Source Code of org.apache.solr.handler.dataimport.TikaEntityProcessor

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.dataimport;

import org.apache.commons.io.IOUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.XHTMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;

import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
import static org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow;
import static org.apache.solr.handler.dataimport.DataImporter.COLUMN;
import static org.apache.solr.handler.dataimport.XPathEntityProcessor.URL;
/**
* <p>An implementation of {@link EntityProcessor} which reads data from rich docs
* using <a href="http://tika.apache.org/">Apache Tika</a>
*
* @version $Id: TikaEntityProcessor.java 1072233 2011-02-19 01:55:16Z hossman $
* @since solr 3.1
*/
public class TikaEntityProcessor extends EntityProcessorBase {
  private TikaConfig tikaConfig;
  private static final Logger LOG = LoggerFactory.getLogger(TikaEntityProcessor.class);
  private String format = "text";
  private boolean done = false;
  private String parser;
  static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";


  @Override
  protected void firstInit(Context context) {
    try {
      String tikaConfigFile = context.getResolvedEntityAttribute("tikaConfig");
      if (tikaConfigFile == null) {
        ClassLoader classLoader = context.getSolrCore().getResourceLoader().getClassLoader();
        tikaConfig = new TikaConfig(classLoader);
      } else {
        File configFile = new File(tikaConfigFile);
        if (!configFile.isAbsolute()) {
          configFile = new File(context.getSolrCore().getResourceLoader().getConfigDir(), tikaConfigFile);
        }
        tikaConfig = new TikaConfig(configFile);
      }
    } catch (Exception e) {
      wrapAndThrow (SEVERE, e,"Unable to load Tika Config");
    }

    format = context.getResolvedEntityAttribute("format");
    if(format == null)
      format = "text";
    if (!"html".equals(format) && !"xml".equals(format) && !"text".equals(format)&& !"none".equals(format) )
      throw new DataImportHandlerException(SEVERE, "'format' can be one of text|html|xml|none");
    parser = context.getResolvedEntityAttribute("parser");
    if(parser == null) {
      parser = AUTO_PARSER;
    }
    done = false;
  }

  @Override
  public Map<String, Object> nextRow() {
    if(done) return null;
    Map<String, Object> row = new HashMap<String, Object>();
    DataSource<InputStream> dataSource = context.getDataSource();
    InputStream is = dataSource.getData(context.getResolvedEntityAttribute(URL));
    ContentHandler contentHandler = null;
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();
    try {
      if ("html".equals(format)) {
        contentHandler = getHtmlHandler(sw);
      } else if ("xml".equals(format)) {
        contentHandler = getXmlContentHandler(sw);
      } else if ("text".equals(format)) {
        contentHandler = getTextContentHandler(sw);
      } else if("none".equals(format)){
        contentHandler = new DefaultHandler();       
      }
    } catch (TransformerConfigurationException e) {
      wrapAndThrow(SEVERE, e, "Unable to create content handler");
    }
    Parser tikaParser = null;
    if(parser.equals(AUTO_PARSER)){
      AutoDetectParser parser = new AutoDetectParser();
      parser.setConfig(tikaConfig);
      tikaParser = parser;
    } else {
      tikaParser = (Parser) context.getSolrCore().getResourceLoader().newInstance(parser);
    }
    try {
      tikaParser.parse(is, contentHandler, metadata , new ParseContext());
    } catch (Exception e) {
      wrapAndThrow(SEVERE, e, "Unable to read content");
    }
    IOUtils.closeQuietly(is);
    for (Map<String, String> field : context.getAllEntityFields()) {
      if (!"true".equals(field.get("meta"))) continue;
      String col = field.get(COLUMN);
      String s = metadata.get(col);
      if (s != null) row.put(col, s);
    }
    if(!"none".equals(format) ) row.put("text", sw.toString());
    done = true;
    return row;
  }

  private static ContentHandler getHtmlHandler(Writer writer)
          throws TransformerConfigurationException {
    SAXTransformerFactory factory = (SAXTransformerFactory)
            SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
    handler.setResult(new StreamResult(writer));
    return new ContentHandlerDecorator(handler) {
      @Override
      public void startElement(
              String uri, String localName, String name, Attributes atts)
              throws SAXException {
        if (XHTMLContentHandler.XHTML.equals(uri)) {
          uri = null;
        }
        if (!"head".equals(localName)) {
          super.startElement(uri, localName, name, atts);
        }
      }

      @Override
      public void endElement(String uri, String localName, String name)
              throws SAXException {
        if (XHTMLContentHandler.XHTML.equals(uri)) {
          uri = null;
        }
        if (!"head".equals(localName)) {
          super.endElement(uri, localName, name);
        }
      }

      @Override
      public void startPrefixMapping(String prefix, String uri) {/*no op*/ }

      @Override
      public void endPrefixMapping(String prefix) {/*no op*/ }
    };
  }

  private static ContentHandler getTextContentHandler(Writer writer) {
    return new BodyContentHandler(writer);
  }

  private static ContentHandler getXmlContentHandler(Writer writer)
          throws TransformerConfigurationException {
    SAXTransformerFactory factory = (SAXTransformerFactory)
            SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    handler.setResult(new StreamResult(writer));
    return handler;
  }

}
TOP

Related Classes of org.apache.solr.handler.dataimport.TikaEntityProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.