Package org.apache.mahout.df.data

Examples of org.apache.mahout.df.data.Data


  }
 
  protected static Data loadData(Configuration conf, Path dataPath, Dataset dataset) throws IOException {
    log.info("Loading the data...");
    FileSystem fs = dataPath.getFileSystem(conf);
    Data data = DataLoader.loadData(dataset, fs, dataPath);
    log.info("Data Loaded");
   
    return data;
  }
View Full Code Here


  }
 
  protected static Data loadData(Configuration conf, Path dataPath, Dataset dataset) throws IOException {
    log.info("Loading the data...");
    FileSystem fs = dataPath.getFileSystem(conf);
    Data data = DataLoader.loadData(dataset, fs, dataPath);
    log.info("Data Loaded");
   
    return data;
  }
View Full Code Here

  private void runIteration(Random rng, Data data, int m, int nbtrees) {
   
    int nblabels = data.getDataset().nblabels();
   
    log.info("Splitting the data");
    Data train = data.clone();
    Data test = train.rsplit(rng, (int) (data.size() * 0.1));
   
    int[] trainLabels = train.extractLabels();
    int[] testLabels = test.extractLabels();
   
    DefaultTreeBuilder treeBuilder = new DefaultTreeBuilder();
   
    SequentialBuilder forestBuilder = new SequentialBuilder(rng, treeBuilder, train);
   
    // grow a forest with m = log2(M)+1
    ForestPredictions errorM = new ForestPredictions(train.size(), nblabels); // oob error when using m =
                                                                              // log2(M)+1
    treeBuilder.setM(m);
   
    long time = System.currentTimeMillis();
    log.info("Growing a forest with m={}", m);
    DecisionForest forestM = forestBuilder.build(nbtrees, errorM);
    sumTimeM += System.currentTimeMillis() - time;
    numNodesM += forestM.nbNodes();
   
    double oobM = ErrorEstimate.errorRate(trainLabels, errorM.computePredictions(rng)); // oob error estimate
                                                                                        // when m = log2(M)+1
   
    // grow a forest with m=1
    ForestPredictions errorOne = new ForestPredictions(train.size(), nblabels); // oob error when using m = 1
    treeBuilder.setM(1);
   
    time = System.currentTimeMillis();
    log.info("Growing a forest with m=1");
    DecisionForest forestOne = forestBuilder.build(nbtrees, errorOne);
    sumTimeOne += System.currentTimeMillis() - time;
    numNodesOne += forestOne.nbNodes();
   
    double oobOne = ErrorEstimate.errorRate(trainLabels, errorOne.computePredictions(rng)); // oob error
                                                                                            // estimate when m
                                                                                            // = 1
   
    // compute the test set error (Selection Error), and mean tree error (One Tree Error),
    // using the lowest oob error forest
    ForestPredictions testError = new ForestPredictions(test.size(), nblabels); // test set error
    MeanTreeCollector treeError = new MeanTreeCollector(test, nbtrees); // mean tree error
   
    // compute the test set error using m=1 (Single Input Error)
    errorOne = new ForestPredictions(test.size(), nblabels);
   
    if (oobM < oobOne) {
      forestM.classify(test, new MultiCallback(testError, treeError));
      forestOne.classify(test, errorOne);
    } else {
View Full Code Here

    }
   
    // load the data
    FileSystem fs = dataPath.getFileSystem(new Configuration());
    Dataset dataset = Dataset.load(getConf(), datasetPath);
    Data data = DataLoader.loadData(dataset, fs, dataPath);
   
    // take m to be the first integer less than log2(M) + 1, where M is the
    // number of inputs
    int m = (int) Math.floor(Maths.log(2, data.getDataset().nbAttributes()) + 1);
   
    Random rng = RandomUtils.getRandom();
    for (int iteration = 0; iteration < nbIterations; iteration++) {
      log.info("Iteration {}", iteration);
      runIteration(rng, data, m, nbTrees);
View Full Code Here

  }
 
  protected static Data loadData(Configuration conf, Path dataPath, Dataset dataset) throws IOException {
    log.info("Loading the data...");
    FileSystem fs = dataPath.getFileSystem(conf);
    Data data = DataLoader.loadData(dataset, fs, dataPath);
    log.info("Data Loaded");
   
    return data;
  }
View Full Code Here

   *          tree identifier
   */
  public Node build(int treeId, Random rng, PredictionCallback callback) {
    log.debug("Bagging...");
    Arrays.fill(sampled, false);
    Data bag = data.bagging(rng, sampled);
   
    log.debug("Building...");
    Node tree = treeBuilder.build(rng, bag);
   
    // predict the label for the out-of-bag elements
View Full Code Here

    // all the vectors have the same label (0)
    double[][] temp = Utils.randomDoublesWithSameLabel(rng, descriptor, 100, 0);
    String[] sData = Utils.double2String(temp);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    Data data = DataLoader.loadData(dataset, sData);
    DefaultIgSplit iG = new DefaultIgSplit();

    double expected = 0.0 - 1.0 * Math.log(1.0) / Math.log(2.0);
    assertEquals(expected, iG.entropy(data), EPSILON);

View Full Code Here

  public void testComputeSplit() throws Exception {
    IgSplit ref = new DefaultIgSplit();
    IgSplit opt = new OptIgSplit();

    Random rng = RandomUtils.getRandom();
    Data data = Utils.randomData(rng, NUM_ATTRIBUTES, NUM_INSTANCES);

    for (int nloop = 0; nloop < 100; nloop++) {
      int attr = rng.nextInt(data.getDataset().nbAttributes());
      // System.out.println("IsNumerical: " + data.dataset.isNumerical(attr));

      Split expected = ref.computeSplit(data, attr);
      Split actual = opt.computeSplit(data, attr);
View Full Code Here

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    // prepare the data
    log.debug("partition: {} numInstances: {}", partition, instances.size());
   
    Data data = new Data(getDataset(), instances);
    Bagging bagging = new Bagging(getTreeBuilder(), data);
   
    TreeID key = new TreeID();
   
    log.debug("Building {} trees", nbTrees);
    SingleTreePredictions callback = null;
    int[] predictions = null;
    for (int treeId = 0; treeId < nbTrees; treeId++) {
      log.debug("Building tree number : {}", treeId);
      if (isOobEstimate() && !isNoOutput()) {
        callback = new SingleTreePredictions(data.size());
        predictions = callback.getPredictions();
      }
     
      Node tree = bagging.build(treeId, rng, callback);
     
View Full Code Here

    double hy = entropy(data); // H(Y)
    double hyx = 0.0; // H(Y|X)
    double invDataSize = 1.0 / data.size();
   
    for (double value : values) {
      Data subset = data.subset(Condition.equals(attr, value));
      hyx += subset.size() * invDataSize * entropy(subset);
    }
   
    return hy - hyx;
  }
View Full Code Here

TOP

Related Classes of org.apache.mahout.df.data.Data

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.