Package org.apache.mahout.common

Examples of org.apache.mahout.common.IntPairWritable$FirstGroupingComparator
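The snippets below come from Mahout's LDA code, where the IntPairWritable key carries a (topic, word) pair of ints and the nested FirstGroupingComparator compares only the first int, so that every (topic, *) record for a given topic reaches a single reduce call. A minimal, hypothetical job wiring along these lines (a sketch for orientation, not code taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.mahout.common.IntPairWritable;

// Hypothetical wiring, for illustration only; the class and job name are made up.
public class GroupByTopicSetup {
  public static Job configure(Configuration conf) throws Exception {
    Job job = new Job(conf, "lda-step");
    job.setMapOutputKeyClass(IntPairWritable.class);    // keys are (topic, word) pairs
    job.setMapOutputValueClass(DoubleWritable.class);   // values are log-probability contributions
    // Group reduce calls on the first int only, so all (topic, *) records for one
    // topic are handed to a single reduce() invocation.
    job.setGroupingComparatorClass(IntPairWritable.FirstGroupingComparator.class);
    return job;
  }
}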


                                                     int numWordsToPrint) throws IOException {
    FileSystem fs = new Path(dir).getFileSystem(job);
   
    List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
   
    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(dir, "*"))) {
      Path path = status.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
      while (reader.next(key, value)) {
        int topic = key.getFirst();
        int word = key.getSecond();
       
        ensureQueueSize(queues, topic);
        if (word >= 0 && topic >= 0) {
          double score = value.get();
          String realWord = wordList.get(word);
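The ensureQueueSize helper called above is not shown in the snippet; presumably it just grows the list of per-topic queues until index topic is addressable. A hedged sketch of such a helper (an assumption, not the original source; it relies on java.util.PriorityQueue and assumes StringDoublePair is Comparable by score):

  // Assumed helper: grow the list of per-topic queues until index 'topic' exists.
  private static void ensureQueueSize(List<PriorityQueue<StringDoublePair>> queues, int topic) {
    for (int i = queues.size(); i <= topic; ++i) {
      queues.add(new PriorityQueue<StringDoublePair>());
    }
  }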


      int w = e.index();
     
      for (int k = 0; k < state.numTopics; ++k) {
        v.set(doc.phi(k, w) + Math.log(e.get()));
       
        IntPairWritable kw = new IntPairWritable(k, w);
       
        // output (topic, word)'s logProb contribution
        context.write(kw, v);
        logTotals[k] = LDAUtil.logSum(logTotals[k], v.get());
      }
    }
   
    // Output the totals for the statistics. This is to make
    // normalizing a lot easier.
    for (int k = 0; k < state.numTopics; ++k) {
      IntPairWritable kw = new IntPairWritable(k, LDADriver.TOPIC_SUM_KEY);
      v.set(logTotals[k]);
      assert !Double.isNaN(v.get());
      context.write(kw, v);
    }
    IntPairWritable llk = new IntPairWritable(LDADriver.LOG_LIKELIHOOD_KEY, LDADriver.LOG_LIKELIHOOD_KEY);
    // Output log-likelihoods.
    v.set(doc.logLikelihood);
    context.write(llk, v);
  }
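The mapper above accumulates per-topic totals with LDAUtil.logSum. A minimal sketch of the standard log-sum-exp computation such a helper performs (an assumption about its behavior, not the Mahout source):

  // Stable log(exp(a) + exp(b)); assumed behavior of a helper like LDAUtil.logSum.
  static double logSum(double a, double b) {
    if (a == Double.NEGATIVE_INFINITY) {
      return b;                 // exp(a) == 0, so the sum is just exp(b)
    }
    if (b == Double.NEGATIVE_INFINITY) {
      return a;
    }
    double max = Math.max(a, b);
    return max + Math.log(Math.exp(a - max) + Math.exp(b - max));
  }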

      SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, IntPairWritable.class,
          DoubleWritable.class);
     
      double total = 0.0; // total number of pseudo counts we made
      for (int w = 0; w < numWords; ++w) {
        IntPairWritable kw = new IntPairWritable(k, w);
        // A small amount of random noise, with a small floor to keep it strictly positive.
        double pseudocount = random.nextDouble() + 1.0E-8;
        total += pseudocount;
        v.set(Math.log(pseudocount));
        writer.append(kw, v);
      }
      IntPairWritable kTsk = new IntPairWritable(k, TOPIC_SUM_KEY);
      v.set(Math.log(total));
      writer.append(kTsk, v);
     
      writer.close();
    }
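Note that the initial state written above is entirely in log space: each (k, w) entry stores Math.log(pseudocount) and the (k, TOPIC_SUM_KEY) entry stores Math.log(total). For illustration, a probability can be recovered from two such entries as follows (a sketch, not part of the driver):

  // Illustrative only: logCount is the value stored under (k, w),
  // logTopicTotal the value stored under (k, TOPIC_SUM_KEY).
  static double wordProbability(double logCount, double logTopicTotal) {
    return Math.exp(logCount - logTopicTotal);   // count / total, computed via logs
  }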

    Path dir = new Path(statePath);
    FileSystem fs = dir.getFileSystem(job);
   
    double ll = 0.0;
   
    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
      Path path = status.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
      while (reader.next(key, value)) {
        if (key.getFirst() == LOG_LIKELIHOOD_KEY) {
          ll = value.get();
          break;
        }
      }
      reader.close();

   
    DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
    double[] logTotals = new double[numTopics];
    double ll = 0.0;
   
    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
      Path path = status.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
      while (reader.next(key, value)) {
        int topic = key.getFirst();
        int word = key.getSecond();
        if (word == TOPIC_SUM_KEY) {
          logTotals[topic] = value.get();
          if (Double.isInfinite(value.get())) {
            throw new IllegalArgumentException();
          }

      int w = e.index();
     
      for (int k = 0; k < state.getNumTopics(); ++k) {
        v.set(doc.phi(k, w) + Math.log(e.get()));
       
        IntPairWritable kw = new IntPairWritable(k, w);
       
        // output (topic, word)'s logProb contribution
        context.write(kw, v);
        logTotals[k] = LDAUtil.logSum(logTotals[k], v.get());
      }
    }
   
    // Output the totals for the statistics. This is to make
    // normalizing a lot easier.
    for (int k = 0; k < state.getNumTopics(); ++k) {
      IntPairWritable kw = new IntPairWritable(k, LDADriver.TOPIC_SUM_KEY);
      v.set(logTotals[k]);
      assert !Double.isNaN(v.get());
      context.write(kw, v);
    }
    IntPairWritable llk = new IntPairWritable(LDADriver.LOG_LIKELIHOOD_KEY, LDADriver.LOG_LIKELIHOOD_KEY);
    // Output log-likelihoods.
    v.set(doc.getLogLikelihood());
    context.write(llk, v);
  }

    DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
    double[] logTotals = new double[numTopics];
    double ll = 0.0;

    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
      Path path = status.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
      while (reader.next(key, value)) {
        int topic = key.getFirst();
        int word = key.getSecond();
        if (word == TOPIC_SUM_KEY) {
          logTotals[topic] = value.get();
          Preconditions.checkArgument(!Double.isInfinite(value.get()));
        } else if (topic == LOG_LIKELIHOOD_KEY) {
          ll = value.get();

      Path path = new Path(statePath, "part-" + k);
      SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, IntPairWritable.class, DoubleWritable.class);

      double total = 0.0; // total number of pseudo counts we made
      for (int w = 0; w < numWords; ++w) {
        Writable kw = new IntPairWritable(k, w);
        // A small amount of random noise, with a small floor to keep it strictly positive.
        double pseudocount = random.nextDouble() + 1.0E-8;
        total += pseudocount;
        v.set(Math.log(pseudocount));
        writer.append(kw, v);
      }
      Writable kTsk = new IntPairWritable(k, TOPIC_SUM_KEY);
      v.set(Math.log(total));
      writer.append(kTsk, v);

      writer.close();
    }

  private static double findLL(Path statePath, Configuration job) throws IOException {
    FileSystem fs = statePath.getFileSystem(job);

    double ll = 0.0;

    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(statePath, "part-*"))) {
      Path path = status.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
      while (reader.next(key, value)) {
        if (key.getFirst() == LOG_LIKELIHOOD_KEY) {
          ll = value.get();
          break;
        }
      }
      reader.close();

                                                                        PathType.GLOB,
                                                                        null,
                                                                        null,
                                                                        true,
                                                                        job)) {
      IntPairWritable key = record.getFirst();
      DoubleWritable value = record.getSecond();
      int topic = key.getFirst();
      int word = key.getSecond();
      if (word == TOPIC_SUM_KEY) {
        logTotals[topic] = value.get();
        Preconditions.checkArgument(!Double.isInfinite(value.get()));
      } else if (topic == LOG_LIKELIHOOD_KEY) {
        ll = value.get();
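The loop header of this last snippet is truncated; given the trailing arguments (PathType.GLOB, null, null, true, job), it iterates the state files with Mahout's SequenceFileDirIterable (org.apache.mahout.common.iterator.sequencefile) and Pair (org.apache.mahout.common). A sketch of a loop with that shape, assuming dir and job variables as in the earlier snippets (a reconstruction, not the original line):

    // Sketch: iterate every (IntPairWritable, DoubleWritable) record under the part-file glob,
    // reusing the key/value instances between iterations.
    for (Pair<IntPairWritable, DoubleWritable> record
        : new SequenceFileDirIterable<IntPairWritable, DoubleWritable>(new Path(dir, "part-*"),
                                                                       PathType.GLOB,
                                                                       null,
                                                                       null,
                                                                       true,
                                                                       job)) {
      IntPairWritable key = record.getFirst();
      DoubleWritable value = record.getSecond();
      // ... handle TOPIC_SUM_KEY and LOG_LIKELIHOOD_KEY as in the snippet above ...
    }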
