Examples of org.apache.mahout.common.StringTuple

org.apache.mahout.common.StringTuple
An Ordered List of Strings which can be used in a Hadoop Map/Reduce Job

        defaultCategory);
      
      String correctLabel = label;
      String classifiedLabel = result.getLabel();
      
      StringTuple outputTuple = new StringTuple(BayesConstants.CLASSIFIER_TUPLE);
      outputTuple.add(correctLabel);
      outputTuple.add(classifiedLabel);
      
      output.collect(outputTuple, new DoubleWritable(1.0));
    } catch (InvalidDatastoreException e) {
      throw new IOException(e.toString());
    }

View Full Code Here


  @Override
  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
      if (termAtt.termLength() > 0) {
        document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
      }
    }
    context.write(key, document);
  }

View Full Code Here

    throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
      return;
    }
    StringTuple value = it.next();


    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size


    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);


      do {
        String term = (sf.getAttribute(TermAttribute.class)).term();
        if (term.length() > 0 && dictionary.containsKey(term)) { // ngram
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      } while (sf.incrementToken());


      sf.end();
      sf.close();
    } else {
      for (String term : value.getEntries()) {
        if (term.length() > 0 && dictionary.containsKey(term)) { // unigram
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      }

View Full Code Here

    FileStatus[] statuses = fs.listStatus(output, PathFilters.logsCRCFilter());
    assertEquals(1, statuses.length);
    Path filePath = statuses[0].getPath();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, configuration);
    Text key = ClassUtils.instantiateAs((Class<? extends Text>) reader.getKeyClass(), Text.class);
    StringTuple value =
        ClassUtils.instantiateAs((Class<? extends StringTuple>) reader.getValueClass(), StringTuple.class);
    reader.next(key, value);
    assertEquals(documentId1, key.toString());
    assertEquals(Arrays.asList("test", "document", "processor"), value.getEntries());
    reader.next(key, value);
    assertEquals(documentId2, key.toString());
    assertEquals(Arrays.asList("another", "one"), value.getEntries());
  }

View Full Code Here

        @Override
        public boolean apply(String label, double sigmaJ) {
          double weight = Math.log((value.get() + alphaI) / (sigmaJSigmaK - sigmaJ + vocabCount));
          
          reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight);
          StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
          normalizerTuple.add(label);
          try {
            output.collect(normalizerTuple, new DoubleWritable(weight));
          } catch (IOException e) {
            throw new IllegalStateException(e);
          } // output Sigma_j
          return true;
        }
      });
      
    } else {
      String label = key.stringAt(1);
      
      double dIJ = value.get();
      double denominator = 0.5 * (sigmaJSigmaK / vocabCount + dIJ * this.labelWeightSum.size());
      double weight = Math.log1p(-dIJ / denominator);
      
      reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight);
      
      StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
      normalizerTuple.add(label);
      
      // output -D_ij
      output.collect(normalizerTuple, new DoubleWritable(weight));
      
    }

View Full Code Here

    // Log(1 + D_ij)/SQRT( SIGMA(k, D_kj) )
    wordList.forEachPair(new ObjectIntProcedure<String>() {
      @Override
      public boolean apply(String token, int dKJ) {
        try {
          StringTuple tuple = new StringTuple();
          tuple.add(BayesConstants.WEIGHT);
          tuple.add(label);
          tuple.add(token);
          DoubleWritable f = new DoubleWritable(Math.log1p(dKJ) / lengthNormalisation);
          output.collect(tuple, f);
        } catch (IOException e) {
          throw new IllegalStateException(e);
        }
        return true;
      }
    });
    reporter.setStatus("Bayes Feature Mapper: Document Label: " + label);
    
    // Output Document Frequency per Word per Class
    // Corpus Document Frequency (FEATURE_COUNT)
    // Corpus Term Frequency (FEATURE_TF)
    wordList.forEachPair(new ObjectIntProcedure<String>() {
      @Override
      public boolean apply(String token, int dKJ) {
        try {
          StringTuple dfTuple = new StringTuple();
          dfTuple.add(BayesConstants.DOCUMENT_FREQUENCY);
          dfTuple.add(label);
          dfTuple.add(token);
          output.collect(dfTuple, ONE);
          
          StringTuple tokenCountTuple = new StringTuple();
          tokenCountTuple.add(BayesConstants.FEATURE_COUNT);
          tokenCountTuple.add(token);
          output.collect(tokenCountTuple, ONE);
          
          StringTuple tokenTfTuple = new StringTuple();
          tokenTfTuple.add(BayesConstants.FEATURE_TF);
          tokenTfTuple.add(token);
          output.collect(tokenTfTuple, new DoubleWritable(dKJ));
        } catch (IOException e) {
          throw new IllegalStateException(e);
        }
        return true;
      }
    });
    
    // output that we have seen the label to calculate the Count of Document per
    // class
    StringTuple labelCountTuple = new StringTuple();
    labelCountTuple.add(BayesConstants.LABEL_COUNT);
    labelCountTuple.add(label);
    output.collect(labelCountTuple, ONE);
  }

View Full Code Here

    super(StringTuple.class, true);
  }
  
  @Override
  public int compare(WritableComparable a, WritableComparable b) {
    StringTuple ta = (StringTuple) a;
    StringTuple tb = (StringTuple) b;
    Preconditions.checkArgument(ta.length() >= 2 && ta.length() <= 3 && tb.length() >= 2 && tb.length() <= 3,
                                "StringTuple length out of bounds");
    // token
    String tmpa = ta.length() == 2 ? ta.stringAt(1) : ta.stringAt(2);
    String tmpb = tb.length() == 2 ? tb.stringAt(1) : tb.stringAt(2);
    int cmp = tmpa.compareTo(tmpb);
    if (cmp != 0) {
      return cmp;
    }
    
    // type, FEATURE_TF first, then FEATURE_COUNT, then DF or anything else.
    cmp = ta.stringAt(0).compareTo(tb.stringAt(0));
    if (cmp != 0) {
      if (ta.stringAt(0).equals(BayesConstants.FEATURE_TF)) {
        return -1;
      }
      if (tb.stringAt(0).equals(BayesConstants.FEATURE_TF)) {
        return 1;
      }
      if (ta.stringAt(0).equals(BayesConstants.FEATURE_COUNT)) {
        return -1;
      }
      if (tb.stringAt(0).equals(BayesConstants.FEATURE_COUNT)) {
        return 1;
      }
      return cmp;
    }


    // label or empty.
    tmpa = ta.length() == 2 ? "" : ta.stringAt(1);
    tmpb = tb.length() == 2 ? "" : tb.stringAt(1);
    
    return tmpa.compareTo(tmpb);
  }

View Full Code Here

                                                                   PathType.GLOB,
                                                                   null,
                                                                   null,
                                                                   true,
                                                                   conf)) {
      StringTuple key = record.getFirst();
      DoubleWritable value = record.getSecond();
      datastore.loadFeatureWeight(key.stringAt(2), key.stringAt(1), value.get());
    }
  }

View Full Code Here

                                                                   null,
                                                                   null,
                                                                   true,
                                                                   conf)) {
      // Sum of weights for a Feature
      StringTuple key = record.getFirst();
      DoubleWritable value = record.getSecond();
      if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) {
        datastore.setSumFeatureWeight(key.stringAt(1), value.get());
        if (++count % 50000 == 0) {
          log.info("Read {} feature weights", count);
        }
      }
    }

View Full Code Here

                                                                   null,
                                                                   null,
                                                                   true,
                                                                   conf)) {
      // Sum of weights in a Label
      StringTuple key = record.getFirst();
      DoubleWritable value = record.getSecond();
      if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) {
        datastore.setSumLabelWeight(key.stringAt(1), value.get());
        if (++count % 10000 == 0) {
          log.info("Read {} label weights", count);
        }
      }
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.mahout.common.StringTuple

com.digitalpebble.behemoth.mahout.BehemothTokenizerMapper

com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper

org.apache.hadoop.io.Text

org.apache.mahout.classifier.bayes.io.SequenceFileModelReader

org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver

org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierMapper

org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerMapper

org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerMapper

org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureMapper

org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureOutputFormat

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.