Package org.apache.mahout.common

Examples of org.apache.mahout.common.StringTuple


    Text key = new Text();
    key.set("dummy-key");
   
    String[] input = {"the", "best", "of", "times", "the", "worst", "of",
    "times"};
    StringTuple inputTuple = new StringTuple();
    for (String i : input) {
      inputTuple.add(i);
    }
   
    String[][] values = { {"h_the", "the best"},
                          {"t_best", "the best"},
                          {"h_of", "of times"},
View Full Code Here


    Text key = new Text();
    key.set("dummy-key");
   
    String[] input = {"the", "best", "of", "times", "the", "worst", "of",
    "times"};
    StringTuple inputTuple = new StringTuple();
    for (String i : input) {
      inputTuple.add(i);
    }
   
    String[][] values = {{"h_the", "the best"},
                                         {"t_best", "the best"},
                                         {"h_of", "of times"},
View Full Code Here

                     OutputCollector<Text,VectorWritable> output,
                     Reporter reporter) throws IOException {
    if (values.hasNext() == false) {
      return;
    }
    StringTuple value = values.next();
   
    Vector vector = new RandomAccessSparseVector(key.toString(), dimension, value.length()); // guess at
                                                                                             // initial size
   
    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
          maxNGramSize);
     
      do {
        String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
        if (term.length() > 0) { // ngram
          if (dictionary.containsKey(term) == false) {
            continue;
          }
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      } while (sf.incrementToken());
     
      sf.end();
      sf.close();
    } else {
      for (String term : value.getEntries()) {
        if (term.length() > 0) { // unigram
          if (dictionary.containsKey(term) == false) {
            continue;
          }
          int termId = dictionary.get(term);
View Full Code Here

  @Override
  public void map(Text key, Text value,
                  OutputCollector<Text,StringTuple> output, Reporter reporter) throws IOException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
      if (termAtt.termLength() > 0) {
        document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
      }
    }
    output.collect(key, document);
  }
View Full Code Here

        @Override
        public boolean apply(String label, double sigmaJ) {
          double weight = Math.log((value.get() + alphaI) / (sigmaJSigmaK - sigmaJ + vocabCount));
         
          reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight);
          StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
          normalizerTuple.add(label);
          try {
            output.collect(normalizerTuple, new DoubleWritable(weight));
          } catch (IOException e) {
            throw new IllegalStateException(e);
          } // output Sigma_j
          return true;
        }
      });
     
    } else {
      String label = key.stringAt(1);
     
      double dIJ = value.get();
      double denominator = 0.5 * (sigmaJSigmaK / vocabCount + dIJ * this.labelWeightSum.size());
      double weight = Math.log(1.0 - dIJ / denominator);
     
      reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight);
     
      StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
      normalizerTuple.add(label);
     
      // output -D_ij
      output.collect(normalizerTuple, new DoubleWritable(weight));
     
    }
View Full Code Here

    // Log(1 + D_ij)/SQRT( SIGMA(k, D_kj) )
    wordList.forEachPair(new ObjectIntProcedure<String>() {
      @Override
      public boolean apply(String token, int dKJ) {
        try {
          StringTuple tuple = new StringTuple();
          tuple.add(BayesConstants.WEIGHT);
          tuple.add(label);
          tuple.add(token);
          DoubleWritable f = new DoubleWritable(Math.log(1.0 + dKJ) / lengthNormalisation);
          output.collect(tuple, f);
        } catch (IOException e) {
          throw new IllegalStateException(e);
        }
        return true;
      }
    });
    reporter.setStatus("Bayes Feature Mapper: Document Label: " + label);
   
    // Output Document Frequency per Word per Class
    wordList.forEachKey(new ObjectProcedure<String>() {
      @Override
      public boolean apply(String token) {
        try {
          StringTuple dfTuple = new StringTuple();
          dfTuple.add(BayesConstants.DOCUMENT_FREQUENCY);
          dfTuple.add(label);
          dfTuple.add(token);
          output.collect(dfTuple, ONE);
         
          StringTuple tokenCountTuple = new StringTuple();
          tokenCountTuple.add(BayesConstants.FEATURE_COUNT);
          tokenCountTuple.add(token);
          output.collect(tokenCountTuple, ONE);
        } catch (IOException e) {
          throw new IllegalStateException(e);
        }
        return true;
      }
    });
   
    // output that we have seen the label to calculate the Count of Document per
    // class
    StringTuple labelCountTuple = new StringTuple();
    labelCountTuple.add(BayesConstants.LABEL_COUNT);
    labelCountTuple.add(label);
    output.collect(labelCountTuple, ONE);
  }
View Full Code Here

                                                                                InterruptedException {
    HashSet<String> outputValues = new HashSet<String>();
    for (StringTuple value : values) {
      outputValues.addAll(value.getEntries());
    }
    context.write(key, new StringTuple(outputValues));
  }
View Full Code Here

    List<String> oValue = new ArrayList<String>();
    for (int selectedField : selectedFields) {
      oValue.add(fields[selectedField]);
    }
   
    context.write(new Text(oKey.toString()), new StringTuple(oValue));
   
  }
View Full Code Here

    String label = key.stringAt(1);
   
    reporter.setStatus("Bayes Theta Normalizer Mapper: " + label);
   
    double weight = Math.log((value.get() + alphaI) / (labelWeightSum.get(label) + vocabCount));
    StringTuple thetaNormalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
    thetaNormalizerTuple.add(label);
    output.collect(thetaNormalizerTuple, new DoubleWritable(weight));
  }
View Full Code Here

  public static ConfusionMatrix readResult(FileSystem fs,
                                           Path pathPattern,
                                           Configuration conf,
                                           Parameters params) throws IOException {
   
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();
    String defaultLabel = params.get("defaultCat");
    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    Map<String,Map<String,Integer>> confusionMatrix = new HashMap<String,Map<String,Integer>>();
   
    for (FileStatus fileStatus : outputFiles) {
      Path path = fileStatus.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
      while (reader.next(key, value)) {
        String correctLabel = key.stringAt(1);
        String classifiedLabel = key.stringAt(2);
        Map<String,Integer> rowMatrix = confusionMatrix.get(correctLabel);
        if (rowMatrix == null) {
          rowMatrix = new HashMap<String,Integer>();
        }
        Integer count = Double.valueOf(value.get()).intValue();
View Full Code Here

TOP

Related Classes of org.apache.mahout.common.StringTuple

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.