Source Code of com.senseidb.indexing.hadoop.job.MapReduceJob

/**
* This software is licensed to you under the Apache License, Version 2.0 (the
* "Apache License").
*
* LinkedIn's contributions are made under the Apache License. If you contribute
* to the Software, the contributions will be deemed to have been made under the
* Apache License, unless you expressly indicate otherwise. Please do not make any
* contributions that would be inconsistent with the Apache License.
*
* You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, this software
* distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
* License for the specific language governing permissions and limitations for the
* software governed under the Apache License.
*
* © 2012 LinkedIn Corp. All Rights Reserved. 
*/

package com.senseidb.indexing.hadoop.job;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.NumberFormat;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm;
import com.senseidb.indexing.hadoop.keyvalueformat.Shard;
import com.senseidb.indexing.hadoop.map.SenseiMapper;
import com.senseidb.indexing.hadoop.reduce.FileSystemDirectory;
import com.senseidb.indexing.hadoop.reduce.IndexUpdateOutputFormat;
import com.senseidb.indexing.hadoop.reduce.SenseiCombiner;
import com.senseidb.indexing.hadoop.reduce.SenseiReducer;
import com.senseidb.indexing.hadoop.util.LuceneUtil;
import com.senseidb.indexing.hadoop.util.MRConfig;
import com.senseidb.indexing.hadoop.util.MRJobConfig;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;

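/**
 * Builds the Hadoop JobConf for the Sensei index-building job: SenseiMapper
 * emits (Shard, IntermediateForm) pairs, SenseiCombiner merges them in memory,
 * and SenseiReducer writes Lucene index shards through IndexUpdateOutputFormat.
 */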
public class MapReduceJob extends Configured {

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  private static final Logger logger = Logger.getLogger(MapReduceJob.class);
 
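    /**
     * Assembles the JobConf from the current Configuration: resolves the input
     * and output paths, creates or reuses index shards (one reduce task per
     * shard), halves the io.sort.mb buffer to leave room for the combiner's
     * in-memory index, and ships the schema file via the DistributedCache.
     *
     * @param MRClass class whose containing jar is used to run the job
     */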
    public JobConf createJob(Class<?> MRClass) throws IOException, URISyntaxException {

        Configuration conf = getConf();
        Path[] inputPaths;
        Path outputPath;
        Shard[] shards = null;
        int numMapTasks = conf.getInt(MRJobConfig.NUM_MAPS, 2);
        int numShards = conf.getInt(SenseiJobConfig.NUM_SHARDS, 2);
        // inputPaths = FileInputFormat.getInputPaths(jobConf);
     
        String dirs = conf.get(SenseiJobConfig.INPUT_DIRS, null);
        logger.info("dirs: " + dirs);
        String[] list = StringUtils.split(dirs);
        logger.info("length after split: " + list.length);
        inputPaths = new Path[list.length];
        for (int i = 0; i < list.length; i++) {
            inputPaths[i] = new Path(StringUtils.unEscapeString(list[i]));
        }
        logger.info("path[0] is: " + inputPaths[0]);
             
        outputPath = new Path(conf.get(SenseiJobConfig.OUTPUT_DIR));
        String indexPath = conf.get(SenseiJobConfig.INDEX_PATH);
        String indexSubDirPrefix = conf.get(SenseiJobConfig.INDEX_SUBDIR_PREFIX, "");
        shards = createShards(indexPath, numShards, conf, indexSubDirPrefix);

        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath) && conf.getBoolean(SenseiJobConfig.FORCE_OUTPUT_OVERWRITE, false)) {
            fs.delete(outputPath, true);
        }
        if (fs.exists(new Path(indexPath)) && conf.getBoolean(SenseiJobConfig.FORCE_OUTPUT_OVERWRITE, false)) {
            fs.delete(new Path(indexPath), true);
        }
     
     
        // set the starting generation for each shard
        // when a reduce task fails, a new reduce task
        // has to know where to re-start
        setShardGeneration(conf, shards);

        Shard.setIndexShards(conf, shards);

        // MapTask.MapOutputBuffer uses JobContext.IO_SORT_MB to decide its max buffer size
        // (max buffer size = 1/2 * JobContext.IO_SORT_MB).
        // Here we halve JobContext.IO_SORT_MB because the other half of the memory is
        // used to build an intermediate form/index in the combiner.
        conf.setInt(MRJobConfig.IO_SORT_MB, conf.getInt(MRJobConfig.IO_SORT_MB, 100) / 2);
       
        // set the temp dir for the job
        conf.set(MRConfig.TEMP_DIR, "${mapred.child.tmp}/hindex/");
        if (fs.exists(new Path(conf.get(MRConfig.TEMP_DIR)))) {
            fs.delete(new Path(conf.get(MRConfig.TEMP_DIR)), true);
        }
        if (fs.exists(new Path("./tmp"))) {
            fs.delete(new Path("./tmp"), true);
        }

        new Trash(conf).expunge(); // empty the trash
       
       
        // always use the compound file format to speed things up
        conf.setBoolean(SenseiJobConfig.USE_COMPOUND_FILE, true);

        String schemaFile = conf.get(SenseiJobConfig.SCHEMA_FILE_URL);
        if (schemaFile == null) {
            throw new IOException("no schema file found");
        }
        logger.info("Adding schema file: " + schemaFile);
        DistributedCache.addCacheFile(new URI(schemaFile), conf);

        // create the job configuration
        JobConf jobConf = new JobConf(conf, MRClass);
        if (jobConf.getJobName().length() < 1) {
            jobConf.setJobName(MRClass.getName() + "_" + System.currentTimeMillis());
        }

        // provided by application
        FileInputFormat.setInputPaths(jobConf, inputPaths);
        FileOutputFormat.setOutputPath(jobConf, outputPath);

        jobConf.setNumMapTasks(numMapTasks);

        // already set shards
        jobConf.setNumReduceTasks(shards.length);

        jobConf.setInputFormat(
            conf.getClass(SenseiJobConfig.INPUT_FORMAT, TextInputFormat.class, InputFormat.class));

        Path[] inputs = FileInputFormat.getInputPaths(jobConf);
        StringBuilder buffer = new StringBuilder(inputs[0].toString());
        for (int i = 1; i < inputs.length; i++) {
          buffer.append(",");
          buffer.append(inputs[i].toString());
        }
        logger.info("mapred.input.dir = " + buffer.toString());
        logger.info("mapreduce.output.fileoutputformat.outputdir = " +
                 FileOutputFormat.getOutputPath(jobConf).toString());
        logger.info("mapreduce.job.maps = " + jobConf.getNumMapTasks());
        logger.info("mapreduce.job.reduces = " + jobConf.getNumReduceTasks());
        logger.info(shards.length + " shards = " + conf.get(SenseiJobConfig.INDEX_SHARDS));
        logger.info("mapred.input.format.class = "
            + jobConf.getInputFormat().getClass().getName());
        logger.info("mapreduce.cluster.temp.dir = " + jobConf.get(MRConfig.TEMP_DIR));

        // set by the system
        jobConf.setMapOutputKeyClass(Shard.class);
        jobConf.setMapOutputValueClass(IntermediateForm.class);
        jobConf.setOutputKeyClass(Shard.class);
        jobConf.setOutputValueClass(Text.class);

        jobConf.setMapperClass(SenseiMapper.class);
        // no partitioner needed; the default HashPartitioner distributes by the Shard key
        jobConf.setCombinerClass(SenseiCombiner.class);
        jobConf.setReducerClass(SenseiReducer.class);

        jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

        jobConf.setReduceSpeculativeExecution(false);
        return jobConf;
    }
   
    // Obtains a FileSystem as the given user; currently unused within this class.
    private static FileSystem getFileSystem(String user) {
        Configuration conf = new Configuration();
        conf.set("hadoop.job.ugi", user);
        try {
            return FileSystem.get(conf);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
   
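    /**
     * Reuses any existing shard directories under indexPath (sorted by name)
     * and creates new ones named indexSubDirPrefix plus a running number until
     * numShards shards exist.
     */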
    private static Shard[] createShards(String indexPath, int numShards,
            Configuration conf, String indexSubDirPrefix) throws IOException {

        String parent = Shard.normalizePath(indexPath) + Path.SEPARATOR;
        long versionNumber = -1;
        long generation = -1;

        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(indexPath);

        if (fs.exists(path)) {
            // reuse the existing shard directories, sorted by name
            FileStatus[] fileStatus = fs.listStatus(path);
            String[] shardNames = new String[fileStatus.length];
            int count = 0;
            for (int i = 0; i < fileStatus.length; i++) {
                if (fileStatus[i].isDir()) {
                    shardNames[count] = fileStatus[i].getPath().getName();
                    count++;
                }
            }
            Arrays.sort(shardNames, 0, count);

            Shard[] shards = new Shard[count >= numShards ? count : numShards];
            for (int i = 0; i < count; i++) {
                shards[i] = new Shard(versionNumber, parent + shardNames[i], generation);
            }

            // create new shard directories until numShards is reached,
            // skipping names that already exist
            int number = count;
            for (int i = count; i < numShards; i++) {
                String shardPath;
                while (true) {
                    shardPath = parent + indexSubDirPrefix + NUMBER_FORMAT.format(number++);
                    if (!fs.exists(new Path(shardPath))) {
                        break;
                    }
                }
                shards[i] = new Shard(versionNumber, shardPath, generation);
            }
            return shards;
        } else {
            Shard[] shards = new Shard[numShards];
            for (int i = 0; i < shards.length; i++) {
                shards[i] = new Shard(versionNumber,
                        parent + indexSubDirPrefix + NUMBER_FORMAT.format(i), generation);
            }
            return shards;
        }
    }
   
   
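    /**
     * Opens each existing shard directory and records its current Lucene
     * segment generation, so a restarted reduce task knows where to resume.
     */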
    void setShardGeneration(Configuration conf, Shard[] shards) throws IOException {
        FileSystem fs = FileSystem.get(conf);

        for (int i = 0; i < shards.length; i++) {
            Path path = new Path(shards[i].getDirectory());
            long generation = -1;

            if (fs.exists(path)) {
                FileSystemDirectory dir = null;

                try {
                    dir = new FileSystemDirectory(fs, path, false, conf);
                    generation = LuceneUtil.getCurrentSegmentGeneration(dir);
                } finally {
                    if (dir != null) {
                        dir.close();
                    }
                }
            }

            if (generation != shards[i].getGeneration()) {
                // set the starting generation for the shard
                shards[i] = new Shard(shards[i].getVersion(), shards[i].getDirectory(), generation);
            }
        }
    }
}
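
For context, a minimal driver sketch follows. It is not part of the SenseiDB source: the class name SenseiIndexingDriver and all path and shard values are illustrative assumptions; only the SenseiJobConfig keys, the Configured.setConf()/createJob() calls, and JobClient.runJob() come from the listing above and the classic org.apache.hadoop.mapred API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

import com.senseidb.indexing.hadoop.job.MapReduceJob;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;

public class SenseiIndexingDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // illustrative values; the keys are the SenseiJobConfig constants read by createJob()
        conf.set(SenseiJobConfig.INPUT_DIRS, "/data/events");                  // comma-separated input dirs
        conf.set(SenseiJobConfig.OUTPUT_DIR, "/data/output");                  // MapReduce output path
        conf.set(SenseiJobConfig.INDEX_PATH, "/data/index");                   // root directory of the shards
        conf.set(SenseiJobConfig.SCHEMA_FILE_URL, "hdfs:///conf/schema.json"); // shipped via DistributedCache
        conf.setInt(SenseiJobConfig.NUM_SHARDS, 4);                            // one reduce task per shard

        MapReduceJob job = new MapReduceJob();
        job.setConf(conf);
        JobConf jobConf = job.createJob(SenseiIndexingDriver.class);           // jar is located via this class
        JobClient.runJob(jobConf);                                             // submit and block until done
    }
}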