Source Code of com.senseidb.indexing.hadoop.map.SenseiMapper

/**
* This software is licensed to you under the Apache License, Version 2.0 (the
* "Apache License").
*
* LinkedIn's contributions are made under the Apache License. If you contribute
* to the Software, the contributions will be deemed to have been made under the
* Apache License, unless you expressly indicate otherwise. Please do not make any
* contributions that would be inconsistent with the Apache License.
*
* You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, this software
* distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
* License for the specific language governing permissions and limitations for the
* software governed under the Apache License.
*
* © 2012 LinkedIn Corp. All Rights Reserved. 
*/
package com.senseidb.indexing.hadoop.map;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.URL;
import java.net.URLConnection;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.Version;
import org.json.JSONException;
import org.json.JSONObject;

import proj.zoie.api.ZoieSegmentReader;
import proj.zoie.api.indexing.AbstractZoieIndexable;
import proj.zoie.api.indexing.ZoieIndexable;
import proj.zoie.api.indexing.ZoieIndexable.IndexingReq;

import com.senseidb.conf.SchemaConverter;
import com.senseidb.conf.SenseiSchema;
import com.senseidb.indexing.DefaultJsonSchemaInterpreter;
import com.senseidb.indexing.JsonFilter;
import com.senseidb.indexing.ShardingStrategy;
import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm;
import com.senseidb.indexing.hadoop.keyvalueformat.Shard;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;

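/**
 * Hadoop mapper that turns raw input records into (Shard, IntermediateForm) pairs.
 * Each record is converted to JSON by a MapInputConverter, interpreted against the
 * Sensei schema into a Lucene Document, folded into an in-memory IntermediateForm
 * index, and emitted under the shard chosen by the configured ShardingStrategy.
 */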
public class SenseiMapper extends MapReduceBase implements Mapper<Object, Object, Shard, IntermediateForm> {

  private final static Logger logger = Logger.getLogger(SenseiMapper.class);
  private static DefaultJsonSchemaInterpreter _defaultInterpreter = null;
  private boolean _use_remote_schema = false;
  private volatile boolean _isConfigured = false;
  private Configuration _conf;
  private Shard[] _shards;
 
  private ShardingStrategy _shardingStategy;
  private MapInputConverter _converter;
 
  private static Analyzer analyzer;
 

   
   
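  /**
   * Converts the input key/value into JSON, interprets it into a Lucene Document,
   * writes the document into a fresh IntermediateForm, and collects the form keyed
   * by the shard returned by the sharding strategy.
   */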
  public void map(Object key, Object value,
                  OutputCollector<Shard, IntermediateForm> output,
                  Reporter reporter) throws IOException {

    if (!_isConfigured)
      throw new IllegalStateException("Mapper's configure method wasn't successful. May not get the correct schema or Lucene Analyzer.");

    JSONObject json = null;
    try {
      json = _converter.getJsonInput(key, value, _conf);
      json = _converter.doFilter(json);
    } catch (Exception e) {
      ExceptionUtils.printRootCauseStackTrace(e);
      throw new IllegalStateException("Data conversion or filtering failed inside the mapper.");
    }
     
     
    if (_defaultInterpreter == null)
      reporter.incrCounter("Map", "Interpreter_null", 1);

    if (_defaultInterpreter != null && json != null && analyzer != null) {

      ZoieIndexable indexable = _defaultInterpreter.convertAndInterpret(json);

      IndexingReq[] idxReqs = indexable.buildIndexingReqs();
      if (idxReqs.length > 0) {
        Document doc = idxReqs[0].getDocument();
        ZoieSegmentReader.fillDocumentID(doc, indexable.getUID());

        if (indexable.isStorable()) {
          byte[] bytes = indexable.getStoreValue();
          if (bytes != null) {
            doc.add(new Field(AbstractZoieIndexable.DOCUMENT_STORE_FIELD, bytes));
          }
        }

        // now we have the uid and the Lucene Document
        IntermediateForm form = new IntermediateForm();
        form.configure(_conf);
        form.process(doc, analyzer);
        form.closeWriter();

        int chosenShard = -1;
        try {
          chosenShard = _shardingStategy.caculateShard(_shards.length, json);
        } catch (JSONException e) {
          throw new IOException("Sharding does not work for the mapper.");
        }
        if (chosenShard >= 0) {
          // insert into one shard
          output.collect(_shards[chosenShard], form);
        } else {
          throw new IOException("Chosen shard for insert must be >= 0. Current shard is: " + chosenShard);
        }
      }
    }
  }
   

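  /**
   * Reads the index shards, sharding strategy and input converter from the job
   * configuration, then loads the schema and analyzer. The mapper is marked as
   * configured only if both the schema and the analyzer are set up successfully.
   */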
  @Override
  public void configure(JobConf job) {
    super.configure(job);
    _conf = job;
    _shards = Shard.getIndexShards(_conf);

    _shardingStategy = (ShardingStrategy) ReflectionUtils.newInstance(
        job.getClass(SenseiJobConfig.DISTRIBUTION_POLICY,
            DummyShardingStrategy.class, ShardingStrategy.class), job);

    _converter = (MapInputConverter) ReflectionUtils.newInstance(
        job.getClass(SenseiJobConfig.MAPINPUT_CONVERTER,
            DummyMapInputConverter.class, MapInputConverter.class), job);

    try {
      setSchema(job);
      setAnalyzer(job);

      _isConfigured = true;
    } catch (Exception e) {
      e.printStackTrace();
      _isConfigured = false;
    }
  }
 
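  /**
   * Lazily instantiates the Lucene Analyzer named by DOCUMENT_ANALYZER, invoking its
   * (Version) constructor with the Version constant named by DOCUMENT_ANALYZER_VERSION.
   */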
  private void setAnalyzer(JobConf conf) throws Exception {

    if (analyzer != null)
      return;

    String version = _conf.get(SenseiJobConfig.DOCUMENT_ANALYZER_VERSION);
    if (version == null)
      throw new IllegalStateException("version has not been specified");

    String analyzerName = _conf.get(SenseiJobConfig.DOCUMENT_ANALYZER);
    if (analyzerName == null)
      throw new IllegalStateException("analyzer name has not been specified");

    Class analyzerClass = Class.forName(analyzerName);
    Constructor constructor = analyzerClass.getConstructor(Version.class);
    analyzer = (Analyzer) constructor.newInstance(Version.valueOf(version));
  }

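  /**
   * Locates the schema XML file (preferring the copy in the DistributedCache over the
   * raw SCHEMA_FILE_URL setting), parses and converts it to a SenseiSchema, and builds
   * the shared DefaultJsonSchemaInterpreter used by map().
   */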
  private void setSchema(JobConf conf) throws Exception {

    String _schema_uri = null;
    String metadataFileName = conf.get(SenseiJobConfig.SCHEMA_FILE_URL);
   
    Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
    if (localFiles != null) {
      for (int i = 0; i < localFiles.length; i++) {
        String strFileName = localFiles[i].toString();
        if (strFileName.contains(conf.get(SenseiJobConfig.SCHEMA_FILE_URL))) {
          metadataFileName = strFileName;
          break;
        }
      }
    }
   
    if (metadataFileName != null && metadataFileName.length() > 0) {
      _schema_uri = "file:///" + metadataFileName;

      if (_defaultInterpreter == null) {

        logger.info("schema file is: " + _schema_uri);
        URL url = new URL(_schema_uri);
        URLConnection conn = url.openConnection();
        conn.connect();

        File xmlSchema = new File(url.toURI());
        if (!xmlSchema.exists()) {
          throw new ConfigurationException("schema file does not exist: " + _schema_uri);
        }

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setIgnoringComments(true);
        DocumentBuilder db = dbf.newDocumentBuilder();
        org.w3c.dom.Document schemaXml = db.parse(xmlSchema);
        schemaXml.getDocumentElement().normalize();
        JSONObject schemaData = SchemaConverter.convert(schemaXml);

        SenseiSchema schema = SenseiSchema.build(schemaData);
        _defaultInterpreter = new DefaultJsonSchemaInterpreter(schema);
      }
    }
  }

}
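
For context, below is a minimal driver sketch, not part of the original source, showing one way this mapper could be wired into a Hadoop job. The SenseiJobConfig keys are the ones read in configure(), setAnalyzer() and setSchema(); the driver class name, schema path and the "LUCENE_30" version string are illustrative assumptions.

import java.net.URI;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.mapred.JobConf;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import com.senseidb.indexing.ShardingStrategy;
import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm;
import com.senseidb.indexing.hadoop.keyvalueformat.Shard;
import com.senseidb.indexing.hadoop.map.DummyMapInputConverter;
import com.senseidb.indexing.hadoop.map.DummyShardingStrategy;
import com.senseidb.indexing.hadoop.map.MapInputConverter;
import com.senseidb.indexing.hadoop.map.SenseiMapper;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;

public class SenseiIndexingDriverSketch {

  public static JobConf buildJobConf() {
    JobConf job = new JobConf(SenseiIndexingDriverSketch.class);
    job.setJobName("sensei-hadoop-indexing");

    job.setMapperClass(SenseiMapper.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);

    // Schema file: setSchema() prefers the copy shipped through the DistributedCache.
    // Both paths below are hypothetical.
    job.set(SenseiJobConfig.SCHEMA_FILE_URL, "schema.xml");
    DistributedCache.addCacheFile(URI.create("/user/sensei/conf/schema.xml"), job);

    // Analyzer is created reflectively in setAnalyzer(); the version string must name
    // an org.apache.lucene.util.Version constant ("LUCENE_30" is assumed here).
    job.set(SenseiJobConfig.DOCUMENT_ANALYZER, StandardAnalyzer.class.getName());
    job.set(SenseiJobConfig.DOCUMENT_ANALYZER_VERSION, "LUCENE_30");

    // Optional overrides; configure() falls back to these Dummy implementations anyway.
    job.setClass(SenseiJobConfig.DISTRIBUTION_POLICY, DummyShardingStrategy.class, ShardingStrategy.class);
    job.setClass(SenseiJobConfig.MAPINPUT_CONVERTER, DummyMapInputConverter.class, MapInputConverter.class);

    // Note: the shard list read by Shard.getIndexShards(conf) must also be configured;
    // that key is not shown in this listing, so it is omitted here.
    return job;
  }
}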