Package org.apache.mahout.vectorizer.encoders

Examples of org.apache.mahout.vectorizer.encoders.Dictionary

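A minimal sketch of the Dictionary API first, assuming the intern()/values()/size() methods of org.apache.mahout.vectorizer.encoders.Dictionary; the label strings are made-up examples. intern(String) assigns each distinct string the next dense integer id, and values() returns the strings in id order:

    Dictionary labels = new Dictionary();
    int a = labels.intern("comp.graphics");     // first distinct label -> id 0
    int b = labels.intern("sci.space");         // next distinct label  -> id 1
    int again = labels.intern("comp.graphics"); // already interned     -> id 0 again

    // values() lists the labels in id order, giving the reverse mapping.
    List<String> ordered = labels.values();
    System.out.println(ordered.get(b));         // "sci.space"
    System.out.println(labels.size());          // 2

The excerpts below put this to work in Mahout's SGD training examples: the first builds a dictionary of category labels while iterating over encoded ASF mail sequence files, the second builds one from the 20 newsgroups directory layout.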

    // Parse the job options: number of target categories, feature vector
    // cardinality, and the thread/pool sizes for the adaptive learner.
    output.mkdirs();
    int numCats = Integer.parseInt(getOption("categories"));
    int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
    int threadCount = Integer.parseInt(getOption("threads", "20"));
    int poolSize = Integer.parseInt(getOption("poolSize", "5"));

    // The Dictionary maps each category name to a dense integer id, which is
    // used as the target label for the logistic regression learner.
    Dictionary asfDictionary = new Dictionary();
    AdaptiveLogisticRegression learningAlgorithm =
        new AdaptiveLogisticRegression(numCats, cardinality, new L1(), threadCount, poolSize);
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    // We ran seq2encoded and split input already, so let's just build up the dictionary
    Configuration conf = new Configuration();
    PathFilter trainFilter = new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.getName().contains("training");
      }
    };
    SequenceFileDirIterator<Text, VectorWritable> iter =
        new SequenceFileDirIterator<Text, VectorWritable>(
            new Path(base.toString()), PathType.LIST, trainFilter, null, true, conf);
    // First pass: intern every record's category label so the dictionary
    // assigns a stable id to each one, and count the training items.
    long numItems = 0;
    while (iter.hasNext()) {
      Pair<Text, VectorWritable> next = iter.next();
      asfDictionary.intern(next.getFirst().toString());
      numItems++;
    }

    System.out.printf("%d training files\n", numItems);


    SGDInfo info = new SGDInfo();

    // Second pass: train the adaptive learner. Each record's key is its
    // category name; interning it again returns the id assigned in the first
    // pass, which becomes the target value for the already-encoded vector.
    iter = new SequenceFileDirIterator<Text, VectorWritable>(
        new Path(base.toString()), PathType.LIST, trainFilter, null, true, conf);
    int k = 0;
    while (iter.hasNext()) {
      Pair<Text, VectorWritable> next = iter.next();
      String ng = next.getFirst().toString();
      int actual = asfDictionary.intern(ng);
      // the vectors were already encoded upstream, so train directly
      learningAlgorithm.train(actual, next.getSecond().get());
      k++;
      State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
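During training the loop keeps polling learningAlgorithm.getBest() for the current best cross-validated learner. Once training finishes, the same dictionary maps the learner's class index back to a category name. A minimal sketch of that lookup, assuming the usual getPayload().getLearner() accessors and a hypothetical already-encoded test vector testVector:

    // testVector: a hypothetical held-out vector encoded the same way as the training data.
    State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> trained = learningAlgorithm.getBest();
    if (trained != null) {
      CrossFoldLearner learner = trained.getPayload().getLearner();
      Vector scores = learner.classifyFull(testVector);   // one score per category id
      int predictedId = scores.maxValueIndex();
      String predictedLabel = asfDictionary.values().get(predictedId);
      System.out.printf("predicted %s (score %.3f)\n", predictedLabel, scores.get(predictedId));
    }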


    // Optional second command-line argument: the "leak type" passed to
    // NewsgroupHelper, controlling how much identifying information from the
    // messages is allowed into the features.
    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }

    // Dictionary assigning a dense integer id to each newsgroup name.
    Dictionary newsGroups = new Dictionary();

    NewsgroupHelper helper = new NewsgroupHelper();
    helper.getEncoder().setProbes(2);
    AdaptiveLogisticRegression learningAlgorithm =
        new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    // Intern each newsgroup directory name (assigning its id) and collect
    // that group's message files for training.
    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());
    SGDInfo info = new SGDInfo();

    int k = 0;


    for (File file : files) {
      // The parent directory name is the newsgroup; interning it again yields
      // the id assigned above, which is used as the training target.
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);

      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
      learningAlgorithm.train(actual, v);

      k++;
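A label's id depends only on the order in which it is first interned (directory listing order above), so the ordered label list normally travels with the trained model. A small sketch of rebuilding an equivalent mapping from that list, using only intern() and values():

    List<String> labelOrder = newsGroups.values();    // newsgroup names in id order

    // Rebuild the same label -> id mapping later, e.g. at classification time.
    Dictionary restored = new Dictionary();
    for (String label : labelOrder) {
      restored.intern(label);                         // ids match the training run
    }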

