Package org.apache.mahout.df.data

Examples of org.apache.mahout.df.data.Data


  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    // prepare the data
    log.debug("partition: {} numInstances: {}", partition, instances.size());
   
    Data data = new Data(getDataset(), instances);
    Bagging bagging = new Bagging(getTreeBuilder(), data);
   
    TreeID key = new TreeID();
   
    log.debug("Building {} trees", nbTrees);
    SingleTreePredictions callback = null;
    int[] predictions = null;
    for (int treeId = 0; treeId < nbTrees; treeId++) {
      log.debug("Building tree number : {}", treeId);
      if (isOobEstimate() && !isNoOutput()) {
        callback = new SingleTreePredictions(data.size());
        predictions = callback.getPredictions();
      }
     
      Node tree = bagging.build(treeId, rng, callback);
     
View Full Code Here


   *           if the data is not set
   */
  public Node build(int treeId, Random rng, PredictionCallback callback) {
    log.debug("Bagging...");
    Arrays.fill(sampled, false);
    Data bag = data.bagging(rng, sampled);
   
    log.debug("Building...");
    Node tree = treeBuilder.build(rng, bag);
   
    // predict the label for the out-of-bag elements
View Full Code Here

      log.warn("attribute {} already selected in a parent node", best.attr);
    }
   
    Node childNode;
    if (data.getDataset().isNumerical(best.attr)) {
      Data loSubset = data.subset(Condition.lesser(best.attr, best.split));
      Node loChild = build(rng, loSubset);
     
      Data hiSubset = data.subset(Condition.greaterOrEquals(best.attr, best.split));
      Node hiChild = build(rng, hiSubset);
     
      childNode = new NumericalNode(best.attr, best.split, loChild, hiChild);
    } else { // CATEGORICAL attribute
      selected[best.attr] = true;
     
      double[] values = data.values(best.attr);
      Node[] childs = new Node[values.length];
     
      for (int index = 0; index < values.length; index++) {
        Data subset = data.subset(Condition.equals(best.attr, values[index]));
        childs[index] = build(rng, subset);
      }
     
      childNode = new CategoricalNode(best.attr, values, childs);
     
View Full Code Here

    double hy = entropy(data); // H(Y)
    double hyx = 0.0; // H(Y|X)
    double invDataSize = 1.0 / data.size();
   
    for (double value : values) {
      Data subset = data.subset(Condition.equals(attr, value));
      hyx += subset.size() * invDataSize * entropy(subset);
    }
   
    return hy - hyx;
  }
View Full Code Here

  protected double numericalIg(Data data, int attr, double split) {
    double hy = entropy(data);
    double invDataSize = 1.0 / data.size();
   
    // LO subset
    Data subset = data.subset(Condition.lesser(attr, split));
    hy -= subset.size() * invDataSize * entropy(subset);
   
    // HI subset
    subset = data.subset(Condition.greaterOrEquals(attr, split));
    hy -= subset.size() * invDataSize * entropy(subset);
   
    return hy;
  }
View Full Code Here

  @Override
  public void close() throws IOException {
    // prepare the data
    log.debug("partition: {} numInstances: {}", partition, instances.size());
   
    Data data = new Data(getDataset(), instances);
    Bagging bagging = new Bagging(getTreeBuilder(), data);
   
    TreeID key = new TreeID();
   
    log.debug("Building {} trees", nbTrees);
    SingleTreePredictions callback = null;
    int[] predictions = null;
    for (int treeId = 0; treeId < nbTrees; treeId++) {
      log.debug("Building tree number: {}", treeId);
      if (isOobEstimate() && !isNoOutput()) {
        callback = new SingleTreePredictions(data.size());
        predictions = callback.getPredictions();
      }
     
      Node tree = bagging.build(treeId, rng, callback);
     
View Full Code Here

    // all the vectors have the same label (0)
    double[][] temp = Utils.randomDoublesWithSameLabel(rng, descriptor, 100, 0);
    String[] sData = Utils.double2String(temp);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    Data data = DataLoader.loadData(dataset, sData);
    DefaultIgSplit iG = new DefaultIgSplit();

    double expected = 0.0 - 1.0 * Math.log(1.0) / Math.log(2.0);
    assertEquals(expected, iG.entropy(data));

View Full Code Here

    IgSplit ref = new DefaultIgSplit();
    IgSplit opt = new OptIgSplit();

    Random rng = RandomUtils.getRandom();
    Data data = Utils.randomData(rng, nbAttributes, numInstances);

    for (int nloop = 0; nloop < n; nloop++) {
      int attr = rng.nextInt(data.getDataset().nbAttributes());
      // System.out.println("IsNumerical: " + data.dataset.isNumerical(attr));

      Split expected = ref.computeSplit(data, attr);
      Split actual = opt.computeSplit(data, attr);
View Full Code Here

    // store the data into a file
    String[] sData = Utils.double2String(source);
    Path dataPath = Utils.writeDataToTestFile(sData);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    Data data = DataLoader.loadData(dataset, sData);

    JobConf jobConf = new JobConf();
    jobConf.setNumMapTasks(numMaps);

    // prepare a custom TreeBuilder that will classify each
View Full Code Here

  }
 
  protected static Data loadData(Configuration conf, Path dataPath, Dataset dataset) throws IOException {
    log.info("Loading the data...");
    FileSystem fs = dataPath.getFileSystem(conf);
    Data data = DataLoader.loadData(dataset, fs, dataPath);
    log.info("Data Loaded");
   
    return data;
  }
View Full Code Here

TOP

Related Classes of org.apache.mahout.df.data.Data

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.