Examples of org.apache.mahout.vectorizer.encoders.Dictionary

Package org.apache.mahout.vectorizer.encoders

Examples of org.apache.mahout.vectorizer.encoders.Dictionary

org.apache.mahout.vectorizer.encoders.Dictionary
Assigns integer codes to strings as they appear.

    output.mkdirs();
    int numCats = Integer.parseInt(getOption("categories"));
    int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
    int threadCount = Integer.parseInt(getOption("threads", "20"));
    int poolSize = Integer.parseInt(getOption("poolSize", "5"));
    Dictionary asfDictionary = new Dictionary();
    AdaptiveLogisticRegression learningAlgorithm =
        new AdaptiveLogisticRegression(numCats, cardinality, new L1(), threadCount, poolSize);
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);


    //We ran seq2encoded and split input already, so let's just build up the dictionary
    Configuration conf = new Configuration();
    PathFilter trainFilter = new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.getName().contains("training");
      }
    };
    SequenceFileDirIterator<Text, VectorWritable> iter =
        new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()),
                                                          PathType.LIST,
                                                          trainFilter,
                                                          null,
                                                          true,
                                                          conf);
    long numItems = 0;
    while (iter.hasNext()) {
      Pair<Text, VectorWritable> next = iter.next();
      asfDictionary.intern(next.getFirst().toString());
      numItems++;
    }


    System.out.println(numItems + " training files");




    SGDInfo info = new SGDInfo();


    iter = new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST, trainFilter,
            null, true, conf);
    int k = 0;
    while (iter.hasNext()) {
      Pair<Text, VectorWritable> next = iter.next();
      String ng = next.getFirst().toString();
      int actual = asfDictionary.intern(ng);
      //we already have encoded
      learningAlgorithm.train(actual, next.getSecond().get());
      k++;
      State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();

View Full Code Here

    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }


    Dictionary newsGroups = new Dictionary();


    NewsgroupHelper helper = new NewsgroupHelper();
    helper.getEncoder().setProbes(2);
    AdaptiveLogisticRegression learningAlgorithm =
        new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);


    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    Collections.shuffle(files);
    System.out.println(files.size() + " training files");
    SGDInfo info = new SGDInfo();


    int k = 0;




    for (File file : files) {
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);


      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
      learningAlgorithm.train(actual, v);


      k++;

View Full Code Here

    //contains the best model
    OnlineLogisticRegression classifier =
        ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);




    Dictionary asfDictionary = new Dictionary();
    Configuration conf = new Configuration();
    PathFilter testFilter = new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.getName().contains("test");
      }
    };
    SequenceFileDirIterator<Text, VectorWritable> iter =
        new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST, testFilter,
        null, true, conf);


    long numItems = 0;
    while (iter.hasNext()) {
      Pair<Text, VectorWritable> next = iter.next();
      asfDictionary.intern(next.getFirst().toString());
      numItems++;
    }


    System.out.println(numItems + " test files");
    ResultAnalyzer ra = new ResultAnalyzer(asfDictionary.values(), "DEFAULT");
    iter = new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST, testFilter,
            null, true, conf);
    while (iter.hasNext()) {
      Pair<Text, VectorWritable> next = iter.next();
      String ng = next.getFirst().toString();


      int actual = asfDictionary.intern(ng);
      Vector result = classifier.classifyFull(next.getSecond().get());
      int cat = result.maxValueIndex();
      double score = result.maxValue();
      double ll = classifier.logLikelihood(actual, next.getSecond().get());
      ClassifierResult cr = new ClassifierResult(asfDictionary.values().get(cat), score, ll);
      ra.addInstance(asfDictionary.values().get(actual), cr);


    }
    output.println(ra);
  }

View Full Code Here

    assertEquals(1.5849625007211563, v.maxValue(), 1.0e-6);
  }


  @Test
  public void testDictionaryOrder() {
    Dictionary dict = new Dictionary();


    dict.intern("a");
    dict.intern("d");
    dict.intern("c");
    dict.intern("b");
    dict.intern("qrz");


    assertEquals("[a, d, c, b, qrz]", dict.values().toString());


    Dictionary dict2 = Dictionary.fromList(dict.values());
    assertEquals("[a, d, c, b, qrz]", dict2.values().toString());


  }

View Full Code Here

   * @param typeMap               A map describing the types of the predictor variables.
   */
  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
    this.targetName = targetName;
    this.typeMap = typeMap;
    targetDictionary = new Dictionary();
  }

View Full Code Here

    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }


    Dictionary newsGroups = new Dictionary();


    encoder.setProbes(2);
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);


    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());


    double averageLL = 0;
    double averageCorrect = 0;


    int k = 0;
    double step = 0;
    int[] bumps = {1, 2, 5};
    for (File file : files.subList(0, 3000)) {
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);


      Vector v = encodeFeatureVector(file, actual, leakType);
      learningAlgorithm.train(actual, v);


      k++;

View Full Code Here

   * @param typeMap               A map describing the types of the predictor variables.
   */
  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
    this.targetName = targetName;
    this.typeMap = typeMap;
    targetDictionary = new Dictionary();
  }

View Full Code Here

   * @param typeMap               A map describing the types of the predictor variables.
   */
  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
    this.targetName = targetName;
    this.typeMap = typeMap;
    targetDictionary = new Dictionary();
  }

View Full Code Here

    //contains the best model
    OnlineLogisticRegression classifier =
        ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);




    Dictionary asfDictionary = new Dictionary();
    //<String> overallCounts = HashMultiset.create();
    Configuration conf = new Configuration();
    PathFilter testFilter = new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.getName().contains("test");
      }
    };
    SequenceFileDirIterator<Text, VectorWritable> iter = new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST, testFilter,
            null, true, conf);


    long numItems = 0;
    while (iter.hasNext()) {
      Pair<Text, VectorWritable> next = iter.next();
      asfDictionary.intern(next.getFirst().toString());
      numItems++;
    }


    System.out.printf("%d test files\n", numItems);
    ResultAnalyzer ra = new ResultAnalyzer(asfDictionary.values(), "DEFAULT");
    iter = new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST, testFilter,
            null, true, conf);
    while (iter.hasNext()) {
      Pair<Text, VectorWritable> next = iter.next();
      String ng = next.getFirst().toString();


      int actual = asfDictionary.intern(ng);
      Vector result = classifier.classifyFull(next.getSecond().get());
      int cat = result.maxValueIndex();
      double score = result.maxValue();
      double ll = classifier.logLikelihood(actual, next.getSecond().get());
      ClassifierResult cr = new ClassifierResult(asfDictionary.values().get(cat), score, ll);
      ra.addInstance(asfDictionary.values().get(actual), cr);


    }
    output.printf("%s\n\n", ra.toString());
  }

View Full Code Here

    File base = new File(inputFile);
    //contains the best model
    OnlineLogisticRegression classifier = ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);




    Dictionary newsGroups = new Dictionary();
    Multiset<String> overallCounts = HashMultiset.create();


    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    System.out.printf("%d test files\n", files.size());
    ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
    for (File file : files) {
      String ng = file.getParentFile().getName();


      int actual = newsGroups.intern(ng);
      NewsgroupHelper helper = new NewsgroupHelper();
      Vector input = helper.encodeFeatureVector(file, actual, 0, overallCounts); //no leak type ensures this is a normal vector
      Vector result = classifier.classifyFull(input);
      int cat = result.maxValueIndex();
      double score = result.maxValue();
      double ll = classifier.logLikelihood(actual, input);
      ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
      ra.addInstance(newsGroups.values().get(actual), cr);


    }
    output.printf("%s\n\n", ra.toString());
  }

View Full Code Here

0 1 2

TOP

Related Classes of org.apache.mahout.vectorizer.encoders.Dictionary

mia.classifier.ch14.TrainNewsGroups

mia.classifier.ch16.train.TrainNewsGroups

org.apache.mahout.classifier.sgd.CsvRecordFactory

org.apache.mahout.classifier.sgd.CsvRecordFactoryTest

org.apache.mahout.classifier.sgd.OnlineLogisticRegressionTest

org.apache.mahout.classifier.sgd.TestASFEmail

org.apache.mahout.classifier.sgd.TestNewsGroups

org.apache.mahout.classifier.sgd.TrainASFEmail

org.apache.mahout.classifier.sgd.TrainNewsGroups

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.