Package ivory.lsh.projection

Source Code of ivory.lsh.projection.WriteRandomVectors$MyMapper0

package ivory.lsh.projection;

import ivory.core.RetrievalEnvironment;
import ivory.lsh.data.FloatAsBytesWritable;
import ivory.lsh.driver.PwsimEnvironment;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.array.ArrayListOfFloatsWritable;
import edu.umd.cloud9.util.PowerTool;

/**
* @author ferhanture
*
*         This class is a Hadoop task to write randomly generated vectors to a SequenceFile
*
*/
@SuppressWarnings("deprecation")
public class WriteRandomVectors extends PowerTool {

  /** Parameter names reported by {@code getRequiredParameters()}; this tool declares none. */
  public static final String[] RequiredParameters = {};
  /** Logger used by this tool and by its nested reducer. */
  private static final Logger sLogger = Logger.getLogger(WriteRandomVectors.class);

  /**
   * Creates the tool around the given configuration, which is passed to the PowerTool superclass.
   *
   * @param conf configuration handed to the superclass constructor
   */
  public WriteRandomVectors(Configuration conf) {
    super(conf);
  }

  /**
   * @author ferhanture
   *
   *         Identity mapper that passes all work to Reducer. Enables multiple Reducers to write
   *         random vectors simultaneously.
   */
  public static class MyMapper0 extends MapReduceBase implements
      Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {

    public void map(IntWritable key, IntWritable value,
        OutputCollector<IntWritable, IntWritable> output, Reporter reporter) throws IOException {
      output.collect(key, value);
    }

  }

  /**
   * @author ferhanture
   *
   *         (K=size of each random vector) needs to be set manually, as the number of terms (i.e.,
   *         vocabulary size) Ideally, this should be passed through the Configuration object, but
   *         that is not checked right now.
   *
   *         The size of a file is the number of random vectors to be written at a single file.
   *         Multiple files are used if necessary. The parameter FILESIZE should be set through the
   *         ComputeSignatures class' SIZE_OF_FILE field. FILESIZE should be the maximum number of
   *         random vectors such that the size of each file is small enough to fit into memory
   */
  public static class MyReducer0 extends MapReduceBase implements
      Reducer<IntWritable, IntWritable, IntWritable, FloatAsBytesWritable> {

    static int D, K;
    FloatAsBytesWritable v;
    IntWritable keyInt = new IntWritable();

    public void configure(JobConf conf) {
      // sLogger.setLevel(Level.DEBUG);
      D = conf.getInt("D", -1);
      K = conf.getInt("K", -1);
    }

    public void reduce(IntWritable key, Iterator<IntWritable> values,
        OutputCollector<IntWritable, FloatAsBytesWritable> output, Reporter reporter)
        throws IOException {
      for (int i = 0; i < D; i++) {
        int index = (D * key.get()) + i; // just some guaranteed-to-be-unique-for-each-reducer
                                         // number
        v = generateUnitRandomVectorAsBytes(K);
        sLogger.debug("vector " + index + " = " + v.size() + "\n--->" + v.get(0) + "," + v.get(1));
        keyInt.set(index);
        output.collect(keyInt, v);
      }
    }
  }

  /**
   * Returns the names of the configuration parameters this tool requires.
   *
   * @return the shared {@code RequiredParameters} array itself (not a copy), which is empty
   */
  @Override
  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  @Override
  public int runTool() throws Exception {
    int D, K;
    D = getConf().getInt("Ivory.NumOfBits", -1);
    String indexPath = getConf().get("Ivory.IndexPath");

    JobConf job = new JobConf(getConf(), WriteRandomVectors.class);
    FileSystem fs = FileSystem.get(job);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    K = (int) env.readCollectionTermCount();
    job.setJobName("WriteRandomVectors");

    if (D <= 0 || K <= 0) {
      throw new RuntimeException("parameters not read properly");
    }

    // Set parameters
    String inputPath = indexPath + "/files";
    String outputPath = PwsimEnvironment.getRandomVectorsDir(indexPath, D);
   
    if (fs.exists(new Path(outputPath))) {
      sLogger.info("Random vectors output path already exists! Quitting...");
      return 0;
    }
    int numMappers = 1;
    int numReducers = 1;

    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, new Path(inputPath),
        IntWritable.class, IntWritable.class);
    for (int i = 0; i < numReducers; i++) {
      writer.append(new IntWritable(i), new IntWritable(i));
    }
    writer.close();

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.map.max.attempts", 10);
    job.setInt("mapred.reduce.max.attempts", 10);
    job.setInt("K", K);
    job.setInt("D", D);

    sLogger.info("Random vectors...");
    sLogger.info("Total number of vectors: " + D);
    sLogger.info("Vector size: " + K);
    sLogger.info("InputPath: " + inputPath);
    sLogger.info("outputPath: " + outputPath);

    job.setNumMapTasks(numMappers);
    job.setNumReduceTasks(numReducers);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(FloatAsBytesWritable.class);
    job.setMapperClass(MyMapper0.class);
    job.setReducerClass(MyReducer0.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    return 0;
  }

  /**
   *
   * This piece of code is based on a publicly available Java code by Kevin Wayne, as referenced
   * below.
   *
   * @param numSamples number of elements in each random vector
   * @return a random vector of numSamples random float values
   */
  public static ArrayListOfFloatsWritable generateUnitRandomVector(int numSamples) {
    /*************************************************************************
     * Author: Kevin Wayne Date: 8/20/04 Compilation: javac StdGaussian.java Execution: java
     * StdGaussian
     **************************************************************************/
    double r, x, y;
    ArrayListOfFloatsWritable vector = new ArrayListOfFloatsWritable(numSamples);
    vector.setSize(numSamples);

    double normalizationFactor = 0;
    for (int i = 0; i < numSamples; i++) {

      // find a uniform random point (x, y) inside unit circle
      do {
        x = 2.0 * Math.random() - 1.0;
        y = 2.0 * Math.random() - 1.0;
        r = x * x + y * y;
      } while (r > 1 || r == 0); // loop executed 4 / pi = 1.273.. times on average
      // http://en.wikipedia.org/wiki/Box-Muller_transform

      // apply the Box-Muller formula to get standard Gaussian z
      double f = (x * Math.sqrt(-2.0 * Math.log(r) / r));
      normalizationFactor += Math.pow(f, 2.0);
      vector.set(i, (float) f);
    }

    /* normalize vector */
    normalizationFactor = Math.sqrt(normalizationFactor);
    for (int i = 0; i < vector.size(); i++) {
      float val = vector.get(i);
      float newf = (float) (val / normalizationFactor);
      vector.set(i, newf);
    }
    return vector;
  }

  public static FloatAsBytesWritable generateUnitRandomVectorAsBytes(int numSamples) {
    double r, x, y;
    ArrayListOfFloatsWritable vector = new ArrayListOfFloatsWritable(numSamples);
    vector.setSize(numSamples);

    byte[] bytes = new byte[numSamples];
    float max = Float.MIN_VALUE;
    float min = Float.MAX_VALUE;

    for (int i = 0; i < numSamples; i++) {

      // find a uniform random point (x, y) inside unit circle
      do {
        x = 2.0 * Math.random() - 1.0;
        y = 2.0 * Math.random() - 1.0;
        r = x * x + y * y;
      } while (r > 1 || r == 0); // loop executed 4 / pi = 1.273.. times on average
      // http://en.wikipedia.org/wiki/Box-Muller_transform

      // apply the Box-Muller formula to get standard Gaussian z
      float f = (float) (x * Math.sqrt(-2.0 * Math.log(r) / r));
      vector.set(i, f);
      if (f > 0 && f > max) {
        max = f;
      } else if (f < 0 && f < min) {
        min = f;
      }

    }

    // System.out.println(max);
    // System.out.println(min);

    /* normalize vector */
    for (int i = 0; i < vector.size(); i++) {
      float val = vector.get(i);
      float normalized2one = 0.0f;
      // map values to [-1,1] range
      if (val > 0) {
        normalized2one = val / max;
      } else if (val < 0) {
        normalized2one = val / Math.abs(min);
      }
      // System.out.println("normalized to [-1,1]: "+val + "=>" + normalized2one);

      // quantize float to byte
      int byteInt = (int) (normalized2one * (Byte.MAX_VALUE + 1));

      byte b;
      if (byteInt > Byte.MAX_VALUE) {
        b = (byte) Byte.MAX_VALUE;
      } else {
        b = (byte) byteInt;
      }

      // store quantized byte value
      // System.out.println("quantized: "+normalized2one + "=>" + b);
      bytes[i] = b;
    }
    FloatAsBytesWritable vector2 = new FloatAsBytesWritable(bytes, max, min);

    // debugging
    // float sum = 0;
    // for(int i=0;i<numSamples;i++){
    // float f1 = vector.get(i);
    // float f2 = vector2.getAsFloat(i);
    // System.out.println(f1+" "+f2+" = "+Math.abs(f1-f2));
    // sum+=Math.pow(f1-f2,2);
    // }
    // System.out.println(Math.sqrt(sum));

    return vector2;
  }

}
TOP

Related Classes of ivory.lsh.projection.WriteRandomVectors$MyMapper0

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.
y>