Package org.apache.mahout.common

Examples of org.apache.mahout.common.StringTuple


  }

  public static double readVocabCount(FileSystem fs, Path pathPattern,
      Configuration conf) throws IOException {
    Map<String, Double> weightSum = new HashMap<String, Double>();
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
      Path path = fileStatus.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
      // the key is *
      while (reader.next(key, value)) {
        if (weightSum.size() > 1) {
          throw new IOException("Incorrect vocabCount File");
        }
        if (key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) {
          weightSum.put(BayesConstants.FEATURE_SET_SIZE, value.get());
        }

      }
    }
View Full Code Here


    return iterator.hasNext();
  }

  @Override
  public List<String> next() {
    StringTuple transaction = iterator.next();
    return transaction.getEntries();
  }
View Full Code Here

    return theSequenceFileOutputFormat.getRecordWriter(fs, job, name, arg3);
  }
 
  @Override
  protected String generateFileNameForKeyValue(WritableComparable<?> k, Writable v, String name) {
    StringTuple key = (StringTuple) k;
   
    if (key.length() == 1 && key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) {
      return "trainer-vocabCount/" + name;
    } else {
      return "trainer-tfIdf/" + name;
    }
  }
View Full Code Here

    List<String> oValue = new ArrayList<String>();
    for (int selectedField : selectedFields) {
      oValue.add(fields[selectedField]);
    }
   
    context.write(new Text(oKey.toString()), new StringTuple(oValue));
   
  }
View Full Code Here

    // Log(1 + D_ij)/SQRT( SIGMA(k, D_kj) )
    wordList.forEachPair(new ObjectIntProcedure<String>() {
      @Override
      public boolean apply(String token, int dKJ) {
        try {
          StringTuple tuple = new StringTuple();
          tuple.add(BayesConstants.WEIGHT);
          tuple.add(label);
          tuple.add(token);
          DoubleWritable f = new DoubleWritable(Math.log(1.0 + dKJ) / lengthNormalisation);
          output.collect(tuple, f);
        } catch (IOException e) {
          throw new IllegalStateException(e);
        }
        return true;
      }
    });
    reporter.setStatus("Bayes Feature Mapper: Document Label: " + label);
   
    // Output Document Frequency per Word per Class
    // Corpus Document Frequency (FEATURE_COUNT)
    // Corpus Term Frequency (FEATURE_TF)
    wordList.forEachPair(new ObjectIntProcedure<String>() {
      @Override
      public boolean apply(String token, int dKJ) {
        try {
          StringTuple dfTuple = new StringTuple();
          dfTuple.add(BayesConstants.DOCUMENT_FREQUENCY);
          dfTuple.add(label);
          dfTuple.add(token);
          output.collect(dfTuple, ONE);
         
          StringTuple tokenCountTuple = new StringTuple();
          tokenCountTuple.add(BayesConstants.FEATURE_COUNT);
          tokenCountTuple.add(token);
          output.collect(tokenCountTuple, ONE);
         
          StringTuple tokenTfTuple = new StringTuple();
          tokenTfTuple.add(BayesConstants.FEATURE_TF);
          tokenTfTuple.add(token);
          output.collect(tokenTfTuple, new DoubleWritable(dKJ));
        } catch (IOException e) {
          throw new IllegalStateException(e);
        }
        return true;
      }
    });
   
    // output that we have seen the label to calculate the Count of Document per
    // class
    StringTuple labelCountTuple = new StringTuple();
    labelCountTuple.add(BayesConstants.LABEL_COUNT);
    labelCountTuple.add(label);
    output.collect(labelCountTuple, ONE);
  }
View Full Code Here

    return theSequenceFileOutputFormat.getRecordWriter(fs, job, name, arg3);
  }
 
  @Override
  protected String generateFileNameForKeyValue(WritableComparable<?> k, Writable v, String name) {
    StringTuple key = (StringTuple) k;
   
    if (key.length() == 1 && key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) {
      return "Sigma_kSigma_j/" + name;
    } else {
      if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) {
        return "Sigma_j/" + name;
      } else if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) {
        return "Sigma_k/" + name;
      } else {
        throw new IllegalArgumentException("Unexpected StringTuple: " + key);
      }
    }
View Full Code Here

    super(StringTuple.class, true);
  }
 
  @Override
  public int compare(WritableComparable a, WritableComparable b) {
    StringTuple ta = (StringTuple) a;
    StringTuple tb = (StringTuple) b;
    Preconditions.checkArgument(ta.length() >= 2 && ta.length() <= 3 && tb.length() >= 2 && tb.length() <= 3,
                                "StringTuple length out of bounds");
    // token
    String tmpa = ta.length() == 2 ? ta.stringAt(1) : ta.stringAt(2);
    String tmpb = tb.length() == 2 ? tb.stringAt(1) : tb.stringAt(2);
    int cmp = tmpa.compareTo(tmpb);
    if (cmp != 0) {
      return cmp;
    }
   
    // type, FEATURE_TF first, then FEATURE_COUNT, then DF or anything else.
    cmp = ta.stringAt(0).compareTo(tb.stringAt(0));
    if (cmp != 0) {
      if (ta.stringAt(0).equals(BayesConstants.FEATURE_TF)) {
        return -1;
      } else if (tb.stringAt(0).equals(BayesConstants.FEATURE_TF)) {
        return 1;
      } else if (ta.stringAt(0).equals(BayesConstants.FEATURE_COUNT)) {
        return -1;
      } else if (tb.stringAt(0).equals(BayesConstants.FEATURE_COUNT)) {
        return 1;
      }
      return cmp;
    }

    // label or empty.
    tmpa = ta.length() == 2 ? "" : ta.stringAt(1);
    tmpb = tb.length() == 2 ? "" : tb.stringAt(1);
   
    return tmpa.compareTo(tmpb);
  }
View Full Code Here

                                                                   PathType.GLOB,
                                                                   null,
                                                                   null,
                                                                   true,
                                                                   conf)) {
      StringTuple key = record.getFirst();
      DoubleWritable value = record.getSecond();
      datastore.loadFeatureWeight(key.stringAt(2), key.stringAt(1), value.get());
    }
  }
View Full Code Here

                                                                   null,
                                                                   null,
                                                                   true,
                                                                   conf)) {
      // Sum of weights for a Feature
      StringTuple key = record.getFirst();
      DoubleWritable value = record.getSecond();
      if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) {
        datastore.setSumFeatureWeight(key.stringAt(1), value.get());
        if (++count % 50000 == 0) {
          log.info("Read {} feature weights", count);
        }
      }
    }
View Full Code Here

                                                                   null,
                                                                   null,
                                                                   true,
                                                                   conf)) {
      // Sum of weights in a Label
      StringTuple key = record.getFirst();
      DoubleWritable value = record.getSecond();
      if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) {
        datastore.setSumLabelWeight(key.stringAt(1), value.get());
        if (++count % 10000 == 0) {
          log.info("Read {} label weights", count);
        }
      }
    }
View Full Code Here

TOP

Related Classes of org.apache.mahout.common.StringTuple

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.