/**
* This software is licensed to you under the Apache License, Version 2.0 (the
* "Apache License").
*
* LinkedIn's contributions are made under the Apache License. If you contribute
* to the Software, the contributions will be deemed to have been made under the
* Apache License, unless you expressly indicate otherwise. Please do not make any
* contributions that would be inconsistent with the Apache License.
*
* You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, this software
* distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
* License for the specific language governing permissions and limitations for the
* software governed under the Apache License.
*
* © 2012 LinkedIn Corp. All Rights Reserved.
*/
package com.senseidb.indexing.hadoop.map;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.URL;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.Version;
import org.json.JSONException;
import org.json.JSONObject;
import proj.zoie.api.ZoieSegmentReader;
import proj.zoie.api.indexing.AbstractZoieIndexable;
import proj.zoie.api.indexing.ZoieIndexable;
import proj.zoie.api.indexing.ZoieIndexable.IndexingReq;
import com.senseidb.conf.SchemaConverter;
import com.senseidb.conf.SenseiSchema;
import com.senseidb.indexing.DefaultJsonSchemaInterpreter;
import com.senseidb.indexing.ShardingStrategy;
import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm;
import com.senseidb.indexing.hadoop.keyvalueformat.Shard;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;
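
/**
 * Hadoop mapper that turns raw input records into Lucene documents and emits
 * them, keyed by their target {@link Shard}, as {@link IntermediateForm}
 * values for the Sensei indexing job.
 *
 * A minimal sketch of wiring this mapper into a job; MyConverter and
 * MyShardingStrategy are placeholders for whatever classes a deployment
 * actually provides:
 *
 * <pre>
 *   JobConf job = new JobConf();
 *   job.setMapperClass(SenseiMapper.class);
 *   job.setMapOutputKeyClass(Shard.class);
 *   job.setMapOutputValueClass(IntermediateForm.class);
 *   job.setClass(SenseiJobConfig.MAPINPUT_CONVERTER, MyConverter.class, MapInputConverter.class);
 *   job.setClass(SenseiJobConfig.DISTRIBUTION_POLICY, MyShardingStrategy.class, ShardingStrategy.class);
 * </pre>
 */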
public class SenseiMapper extends MapReduceBase implements Mapper<Object, Object, Shard, IntermediateForm> {
private final static Logger logger = Logger.getLogger(SenseiMapper.class);

// shared across all mapper instances in this JVM; built once in setSchema()
// from the schema file shipped through the DistributedCache
private static DefaultJsonSchemaInterpreter _defaultInterpreter = null;

private boolean _use_remote_schema = false;
private volatile boolean _isConfigured = false;
private Configuration _conf;
private Shard[] _shards;
private ShardingStrategy _shardingStrategy;
private MapInputConverter _converter;

// shared Lucene analyzer, built reflectively in setAnalyzer()
private static Analyzer analyzer;
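
/**
 * Converts one input record to a JSON object, interprets it into a Lucene
 * {@link Document}, folds the document into an {@link IntermediateForm},
 * and emits the form keyed by the shard chosen by the sharding strategy.
 */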
public void map(Object key, Object value,
OutputCollector<Shard, IntermediateForm> output,
Reporter reporter) throws IOException {
if (!_isConfigured)
throw new IllegalStateException("Mapper's configure method wasn't successful; the schema or the Lucene analyzer may not have been set up correctly.");
JSONObject json = null;
try {
json = _converter.getJsonInput(key, value, _conf);
json = _converter.doFilter(json);
} catch (Exception e) {
ExceptionUtils.printRootCauseStackTrace(e);
throw new IllegalStateException("data conversion or filtering failed inside the mapper", e);
}
// surface a missing interpreter through a counter so the problem shows up in the job UI
if (_defaultInterpreter == null)
reporter.incrCounter("Map", "Interpreter_null", 1);
if (_defaultInterpreter != null && json != null && analyzer != null) {
ZoieIndexable indexable = _defaultInterpreter.convertAndInterpret(json);
IndexingReq[] idxReqs = indexable.buildIndexingReqs();
if (idxReqs.length > 0) {
Document doc = idxReqs[0].getDocument();
ZoieSegmentReader.fillDocumentID(doc, indexable.getUID());
if (indexable.isStorable()) {
byte[] bytes = indexable.getStoreValue();
if (bytes != null) {
doc.add(new Field(AbstractZoieIndexable.DOCUMENT_STORE_FIELD, bytes));
}
}
// we now have the UID and the Lucene document; fold them into an IntermediateForm
IntermediateForm form = new IntermediateForm();
form.configure(_conf);
form.process(doc, analyzer);
form.closeWriter();
int chosenShard = -1;
try {
// "caculateShard" is the spelling used by the ShardingStrategy interface
chosenShard = _shardingStrategy.caculateShard(_shards.length, json);
} catch (JSONException e) {
throw new IOException("sharding failed inside the mapper", e);
}
if (chosenShard >= 0) {
// insert into one shard
output.collect(_shards[chosenShard], form);
} else {
throw new IOException("Chosen shard for insert must be >= 0. current shard is: " + chosenShard);
}
}
}
}
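
/**
 * Wires up the mapper from the job configuration: loads the target shards,
 * instantiates the sharding strategy and the input converter, and sets up
 * the shared schema interpreter and Lucene analyzer. Any failure is logged
 * and leaves the mapper unconfigured, so map() fails fast.
 */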
@Override
public void configure(JobConf job) {
super.configure(job);
_conf = job;
_shards = Shard.getIndexShards(_conf);
_shardingStrategy =
(ShardingStrategy) ReflectionUtils.newInstance(
job.getClass(SenseiJobConfig.DISTRIBUTION_POLICY,
DummyShardingStrategy.class, ShardingStrategy.class), job);
_converter = (MapInputConverter) ReflectionUtils.newInstance(
job.getClass(SenseiJobConfig.MAPINPUT_CONVERTER,
DummyMapInputConverter.class, MapInputConverter.class), job);
try {
setSchema(job);
setAnalyzer(job);
_isConfigured = true;
} catch (Exception e) {
logger.error("failed to configure the mapper", e);
_isConfigured = false;
}
}
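
/**
 * Lazily builds the shared Lucene {@link Analyzer} via reflection. The
 * configured analyzer class must expose a constructor that takes a single
 * {@link Version} argument, as StandardAnalyzer does. A minimal sketch of
 * the expected job settings; the class name and version below are
 * illustrative, not required values:
 *
 * <pre>
 *   conf.set(SenseiJobConfig.DOCUMENT_ANALYZER, "org.apache.lucene.analysis.standard.StandardAnalyzer");
 *   conf.set(SenseiJobConfig.DOCUMENT_ANALYZER_VERSION, "LUCENE_30");
 * </pre>
 */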
private void setAnalyzer(JobConf conf) throws Exception {
if (analyzer != null)
return;
String version = conf.get(SenseiJobConfig.DOCUMENT_ANALYZER_VERSION);
if (version == null)
throw new IllegalStateException("the Lucene analyzer version has not been specified");
String analyzerName = conf.get(SenseiJobConfig.DOCUMENT_ANALYZER);
if (analyzerName == null)
throw new IllegalStateException("the analyzer class name has not been specified");
Class<?> analyzerClass = Class.forName(analyzerName);
Constructor<?> constructor = analyzerClass.getConstructor(Version.class);
analyzer = (Analyzer) constructor.newInstance(Version.valueOf(version));
}
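
/**
 * Resolves the schema XML file, preferring a local copy shipped through the
 * DistributedCache (matched by name against SCHEMA_FILE_URL), parses it into
 * a DOM document, converts it to JSON via {@link SchemaConverter}, and builds
 * the shared {@link DefaultJsonSchemaInterpreter} from the resulting
 * {@link SenseiSchema}.
 */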
private void setSchema(JobConf conf) throws Exception {
String schemaUri = null;
String schemaFileUrl = conf.get(SenseiJobConfig.SCHEMA_FILE_URL);
String metadataFileName = schemaFileUrl;
Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
if (localFiles != null) {
// prefer the local copy of the schema file shipped through the DistributedCache
for (int i = 0; i < localFiles.length; i++) {
String strFileName = localFiles[i].toString();
if (strFileName.contains(schemaFileUrl)) {
metadataFileName = strFileName;
break;
}
}
}
if (metadataFileName != null && metadataFileName.length() > 0) {
schemaUri = "file:///" + metadataFileName;
if (_defaultInterpreter == null) {
logger.info("schema file is: " + schemaUri);
URL url = new URL(schemaUri);
File xmlSchema = new File(url.toURI());
if (!xmlSchema.exists()) {
throw new ConfigurationException("schema file does not exist: " + xmlSchema);
}
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
dbf.setIgnoringComments(true);
DocumentBuilder db = dbf.newDocumentBuilder();
org.w3c.dom.Document schemaXml = db.parse(xmlSchema);
schemaXml.getDocumentElement().normalize();
JSONObject schemaData = SchemaConverter.convert(schemaXml);
SenseiSchema schema = SenseiSchema.build(schemaData);
_defaultInterpreter = new DefaultJsonSchemaInterpreter(schema);
}
}
}
}