Package ivory.lsh.driver

Source Code of ivory.lsh.driver.RunComputeSignatures

package ivory.lsh.driver;

import ivory.core.RetrievalEnvironment;
import ivory.lsh.projection.ComputeSignaturesMinhash;
import ivory.lsh.projection.ComputeSignaturesRandom;
import ivory.lsh.projection.ComputeSignaturesSimhash;
import ivory.lsh.projection.WriteRandomVectors;

import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;


/**
*  A Hadoop task to compute signatures from document vectors.
*
* @author ferhanture
*
*
*/
public class RunComputeSignatures extends PwsimEnvironment implements Tool {

  public static final String[] RequiredParameters = {};
  private static final Logger sLogger = Logger.getLogger(RunComputeSignatures.class);

  private static int printUsage() {
    System.out.println("usage: [index-path] [num-of-bits] [type-of-signature] ([batch-size]) ([dot-prod-thresholds])\nIgnore last two arguments if you don't know what they are. If you want non-batch mode but want to include 'dot product thresholds' file, enter X for [batch-size]");
    return -1;
  }

  public int run(String[] args) throws Exception {
    if(args.length!=3 && args.length!=4 && args.length!=5){
      printUsage();
      return -1;
    }
    boolean batchSizeGiven = (args.length>=4);
    numOfBits = Integer.parseInt(args[1]);
    signatureType = args[2].toLowerCase();
    String dir = args[0];

    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);

    config.set("Ivory.IndexPath", dir);
    config.setInt("Ivory.NumOfBits", numOfBits);
   
    String type = (signatureType.charAt(0)+"").toUpperCase()+signatureType.substring(1, signatureType.length());    //capitalize first character
    RetrievalEnvironment env = new RetrievalEnvironment(dir, fs);
    String collName = env.readCollectionName();
    config.set("Ivory.CollectionName", collName);
   
    PwsimEnvironment.setClassTypes(config);
    int batchSize = -1;
    try {
      if(batchSizeGiven){
        batchSize = Integer.parseInt(args[3]);
        if(batchSize>0){
          int numDocs = env.readCollectionDocumentCount();
          numBatchFiles = numDocs / batchSize;
          if(numDocs % batchSize > 0) numBatchFiles++;
          System.out.println("numBatchFiles: "+numBatchFiles);
          config.setInt("NumBatch", numBatchFiles);
        }
      }
    } catch (NumberFormatException e) {
      sLogger.info("Batch size not an integer! Running in regular (non-batch) mode...");
      batchSizeGiven = false;
    }

    if(type.equals("Random")){
      WriteRandomVectors writeRandomTask = new WriteRandomVectors(config);
      writeRandomTask.run();
      if(args.length==5){
        config.set("Ivory.DotProdThreshFile", args[4]);
      }
      ComputeSignaturesRandom computeSignaturesTask = new ComputeSignaturesRandom(config);
      computeSignaturesTask.run();
    }else if(type.equals("Simhash")){
      if(numOfBits!=64){
        sLogger.info("Simhash signatures need to be 64 bits! Quitting...");
        System.exit(0);
      }
      ComputeSignaturesSimhash computeSignaturesTask = new ComputeSignaturesSimhash(config);
      computeSignaturesTask.run();
    }else//minhash
      ComputeSignaturesMinhash computeSignaturesTask = new ComputeSignaturesMinhash(config);
      computeSignaturesTask.run();
    }
   
    return 0;
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new RunComputeSignatures(), args);
    System.exit(res);
  }

}
TOP

Related Classes of ivory.lsh.driver.RunComputeSignatures

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.