Package ivory.lsh.driver

Source Code of ivory.lsh.driver.RunComputeSignatures

package ivory.lsh.driver;

import ivory.core.RetrievalEnvironment;
import ivory.lsh.projection.ComputeSignaturesMinhash;
import ivory.lsh.projection.ComputeSignaturesRandom;
import ivory.lsh.projection.ComputeSignaturesSimhash;
import ivory.lsh.projection.WriteRandomVectors;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;


/**
*  A Hadoop task to compute signatures from document vectors.
*
* @author ferhanture
*
*
*/
public class RunComputeSignatures extends PwsimEnvironment implements Tool {

  public static final String[] RequiredParameters = {};
  private static final Logger sLogger = Logger.getLogger(RunComputeSignatures.class);
  private static Options options;

  private static void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp( "RunComputeSignatures", options );
    System.exit(-1);   
  }

  public int run(String[] args) throws Exception {
    CommandLine cmdline = parseArgs(args);
    if ( cmdline == null ) {
      printUsage();
      return -1;
    }

    numOfBits = Integer.parseInt(cmdline.getOptionValue(SIZE_OPTION));
    signatureType = cmdline.getOptionValue(TYPE_OPTION).toLowerCase();
    String dir = cmdline.getOptionValue(INDEX_OPTION);

    Configuration config = getConf();
    FileSystem fs = FileSystem.get(config);

    config.set("Ivory.IndexPath", dir);
    config.setInt("Ivory.NumOfBits", numOfBits);

    String type = (signatureType.charAt(0)+"").toUpperCase() + signatureType.substring(1, signatureType.length());    //capitalize first character
    RetrievalEnvironment env = new RetrievalEnvironment(dir, fs);
    String collName = env.readCollectionName();
    config.set("Ivory.CollectionName", collName);

    PwsimEnvironment.setClassTypes(signatureType, config);
    int batchSize = -1;
    if (cmdline.hasOption(BATCH_OPTION)) {
      batchSize = Integer.parseInt(cmdline.getOptionValue(BATCH_OPTION));
      if (batchSize > 0) {
        int numDocs = env.readCollectionDocumentCount();
        numBatchFiles = numDocs / batchSize;
        if(numDocs % batchSize > 0) numBatchFiles++;
        System.out.println("numBatchFiles: "+numBatchFiles);
        config.setInt("NumBatch", numBatchFiles);
      }
    }

    if (type.equals("Random")) {
      WriteRandomVectors writeRandomTask = new WriteRandomVectors(config);
      writeRandomTask.run();
      ComputeSignaturesRandom computeSignaturesTask = new ComputeSignaturesRandom(config);
      computeSignaturesTask.run();
    } else if(type.equals("Simhash")) {
      if (numOfBits != 64) {
        sLogger.info("Simhash signatures need to be 64 bits! Quitting...");
        System.exit(0);
      }
      ComputeSignaturesSimhash computeSignaturesTask = new ComputeSignaturesSimhash(config);
      computeSignaturesTask.run();
    } else //minhash
      ComputeSignaturesMinhash computeSignaturesTask = new ComputeSignaturesMinhash(config);
      computeSignaturesTask.run();
    }

    return 0;
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new RunComputeSignatures(), args);
    System.exit(res);
  }

  private static final String INDEX_OPTION = "index";
  private static final String SIZE_OPTION = "num_bits";
  private static final String TYPE_OPTION = "type";
  private static final String BATCH_OPTION = "batch_size";
  private static final String LIBJARS_OPTION = "libjars";

  @SuppressWarnings("static-access")
  private static CommandLine parseArgs(String[] args) {
    options = new Options();
    options.addOption(OptionBuilder.withDescription("path to collection index").withArgName("path").hasArg().isRequired().create(INDEX_OPTION));
    options.addOption(OptionBuilder.withDescription("number of bits per signature").withArgName("integer").hasArg().isRequired().create(SIZE_OPTION));
    options.addOption(OptionBuilder.withDescription("type of signature").withArgName("random|simhash|minhash").hasArg().isRequired().create(TYPE_OPTION));
    options.addOption(OptionBuilder.withDescription("batch size").withArgName("number of signatures in one output file").hasArg().create(BATCH_OPTION));
    options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars").withArgName("jar packages").hasArg().create(LIBJARS_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return null;
    }
    return cmdline;
  }
}
TOP

Related Classes of ivory.lsh.driver.RunComputeSignatures

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.