Source Code of org.apache.hadoop.contrib.index.main.UpdateIndex

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.hadoop.contrib.index.main;


import java.io.IOException;
import java.text.NumberFormat;
import java.util.Arrays;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.contrib.index.mapred.IndexUpdateConfiguration;
import org.apache.hadoop.contrib.index.mapred.IIndexUpdater;
import org.apache.hadoop.contrib.index.mapred.Shard;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.util.ReflectionUtils;


/**
 * A distributed "index" is partitioned into "shards". Each shard corresponds
 * to a Lucene instance. This class contains the main() method which uses a
 * Map/Reduce job to analyze documents and update Lucene instances in parallel.
 * 
 * The main() method in UpdateIndex requires the following information for
 * updating the shards:
 *   - Input formatter. This specifies how to format the input documents.
 *   - Analysis. This defines the analyzer to use on the input. The analyzer
 *     determines whether a document is being inserted, updated, or deleted.
 *     For inserts or updates, the analyzer also converts each input document
 *     into a Lucene document.
 *   - Input paths. This provides the location(s) of updated documents,
 *     e.g., HDFS files or directories, or HBase tables.
 *   - Shard paths, or index path with the number of shards. Either specify
 *     the path for each shard, or specify an index path and the shards are
 *     the sub-directories of the index directory.
 *   - Output path. When the update to a shard is done, a message is put here.
 *   - Number of map tasks.
 *
 * All of the information can be specified in a configuration file. All but
 * the first two can also be specified as command line options. Check out
 * conf/index-config.xml.template for other configurable parameters.
 *
 * Note: Because of the parallel nature of Map/Reduce, the behaviour of
 * multiple inserts, deletes or updates to the same document is undefined.
 */
public class UpdateIndex {
  public static final Log LOG = LogFactory.getLog(UpdateIndex.class);


  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }


  private static long now() {
    return System.currentTimeMillis();
  }


  private static void printUsage(String cmd) {
    System.err.println("Usage: java " + UpdateIndex.class.getName() + "\n"
        + "                        -inputPaths <inputPath,inputPath>\n"
        + "                        -outputPath <outputPath>\n"
        + "                        -shards     <shardDir,shardDir>\n"
        + "                        -indexPath  <indexPath>\n"
        + "                        -numShards  <num>\n"
        + "                        -numMapTasks <num>\n"
        + "                        -conf       <confPath>\n"
        + "Note: Do not use both -shards option and -indexPath option.");
  }


  private static String getIndexPath(Configuration conf) {
    return conf.get("sea.index.path");
  }


  private static int getNumShards(Configuration conf) {
    return conf.getInt("sea.num.shards", 1);
  }


  private static Shard[] createShards(String indexPath, int numShards,
      Configuration conf) throws IOException {


    String parent = Shard.normalizePath(indexPath) + Path.SEPARATOR;
    long versionNumber = -1;
    long generation = -1;


    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(indexPath);


    if (fs.exists(path)) {
      FileStatus[] fileStatus = fs.listStatus(path);
      String[] shardNames = new String[fileStatus.length];
      int count = 0;
      for (int i = 0; i < fileStatus.length; i++) {
        if (fileStatus[i].isDir()) {
          shardNames[count] = fileStatus[i].getPath().getName();
          count++;
        }
      }
      Arrays.sort(shardNames, 0, count);


      Shard[] shards = new Shard[count >= numShards ? count : numShards];
      for (int i = 0; i < count; i++) {
        shards[i] =
            new Shard(versionNumber, parent + shardNames[i], generation);
      }


      int number = count;
      for (int i = count; i < numShards; i++) {
        String shardPath;
        while (true) {
          shardPath = parent + NUMBER_FORMAT.format(number++);
          if (!fs.exists(new Path(shardPath))) {
            break;
          }
        }
        shards[i] = new Shard(versionNumber, shardPath, generation);
      }
      return shards;
    } else {
      Shard[] shards = new Shard[numShards];
      for (int i = 0; i < shards.length; i++) {
        shards[i] =
            new Shard(versionNumber, parent + NUMBER_FORMAT.format(i),
                generation);
      }
      return shards;
    }
  }


  /**
   * The main() method
   * @param argv
   */
  public static void main(String[] argv) {
    if (argv.length == 0) {
      printUsage("");
      System.exit(-1);
    }


    String inputPathsString = null;
    Path outputPath = null;
    String shardsString = null;
    String indexPath = null;
    int numShards = -1;
    int numMapTasks = -1;
    Configuration conf = new Configuration();
    String confPath = null;


    // parse the command line
    for (int i = 0; i < argv.length; i++) { // parse command line
      if (argv[i].equals("-inputPaths")) {
        inputPathsString = argv[++i];
      } else if (argv[i].equals("-outputPath")) {
        outputPath = new Path(argv[++i]);
      } else if (argv[i].equals("-shards")) {
        shardsString = argv[++i];
      } else if (argv[i].equals("-indexPath")) {
        indexPath = argv[++i];
      } else if (argv[i].equals("-numShards")) {
        numShards = Integer.parseInt(argv[++i]);
      } else if (argv[i].equals("-numMapTasks")) {
        numMapTasks = Integer.parseInt(argv[++i]);
      } else if (argv[i].equals("-conf")) {
        // add as a local FS resource
        confPath = argv[++i];
        conf.addResource(new Path(confPath));
      } else {
        System.out.println("Unknown option " + argv[i] + " w/ value "
            + argv[++i]);
      }
    }
    LOG.info("inputPaths = " + inputPathsString);
    LOG.info("outputPath = " + outputPath);
    LOG.info("shards     = " + shardsString);
    LOG.info("indexPath  = " + indexPath);
    LOG.info("numShards  = " + numShards);
    LOG.info("numMapTasks= " + numMapTasks);
    LOG.info("confPath   = " + confPath);


    Path[] inputPaths = null;
    Shard[] shards = null;


    JobConf jobConf = new JobConf(conf);
    IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(jobConf);


    if (inputPathsString != null) {
      jobConf.set("mapred.input.dir", inputPathsString);
    }
    inputPaths = FileInputFormat.getInputPaths(jobConf);
    if (inputPaths.length == 0) {
      inputPaths = null;
    }


    if (outputPath == null) {
      outputPath = FileOutputFormat.getOutputPath(jobConf);
    }


    if (inputPaths == null || outputPath == null) {
      System.err.println("InputPaths and outputPath must be specified.");
      printUsage("");
      System.exit(-1);
    }


    if (shardsString != null) {
      iconf.setIndexShards(shardsString);
    }
    shards = Shard.getIndexShards(iconf);
    if (shards != null && shards.length == 0) {
      shards = null;
    }


    if (indexPath == null) {
      indexPath = getIndexPath(conf);
    }
    if (numShards <= 0) {
      numShards = getNumShards(conf);
    }


    if (shards == null && indexPath == null) {
      System.err.println("Either shards or indexPath must be specified.");
      printUsage("");
      System.exit(-1);
    }


    if (numMapTasks <= 0) {
      numMapTasks = jobConf.getNumMapTasks();
    }


    try {
      // create shards and set their directories if necessary
      if (shards == null) {
        shards = createShards(indexPath, numShards, conf);
      }


      long startTime = now();
      try {
        IIndexUpdater updater =
            (IIndexUpdater) ReflectionUtils.newInstance(
                iconf.getIndexUpdaterClass(), conf);
        LOG.info("sea.index.updater = "
            + iconf.getIndexUpdaterClass().getName());


        updater.run(conf, inputPaths, outputPath, numMapTasks, shards);
        LOG.info("Index update job is done");


      } finally {
        long elapsedTime = now() - startTime;
        LOG.info("Elapsed time is  " + (elapsedTime / 1000) + "s");
        System.out.println("Elapsed time is " + (elapsedTime / 1000) + "s");
      }
    } catch (Exception e) {
      e.printStackTrace(System.err);
    }
  }
}
Source Code of org.apache.hadoop.contrib.index.main.UpdateIndex

Related Classes of org.apache.hadoop.contrib.index.main.UpdateIndex