Package org.apache.mahout.df.data

Examples of org.apache.mahout.df.data.Data


  @Override
  public void close() throws IOException {
    // prepare the data
    log.debug("partition: " + partition + "numInstances: " + instances.size());
   
    Data data = new Data(getDataset(), instances);
    Bagging bagging = new Bagging(getTreeBuilder(), data);

    TreeID key = new TreeID();

    log.debug("Building " + nbTrees + " trees");
    SingleTreePredictions callback = null;
    int[] predictions = null;
    for (int treeId = 0; treeId < nbTrees; treeId++) {
      log.debug("Building tree N° : " + treeId);
      if (isOobEstimate() && !isNoOutput()) {
        callback = new SingleTreePredictions(data.size());
        predictions = callback.getPredictions();
      }

      Node tree = bagging.build(treeId, rng, callback);
View Full Code Here


    double hy = entropy(data); // H(Y)
    double hyx = 0.0; // H(Y|X)
    double invDataSize = 1.0 / data.size();

    for (double value : values) {
      Data subset = data.subset(Condition.equals(attr, value));
      hyx += subset.size() * invDataSize * entropy(subset);
    }

    return hy - hyx;
  }
View Full Code Here

  protected double numericalIg(Data data, int attr, double split) {
    double hy = entropy(data);
    double invDataSize = 1.0 / data.size();

    // LO subset
    Data subset = data.subset(Condition.lesser(attr, split));
    hy -= subset.size() * invDataSize * entropy(subset);

    // HI subset
    subset = data.subset(Condition.greaterOrEquals(attr, split));
    hy -= subset.size() * invDataSize * entropy(subset);

    return hy;
  }
View Full Code Here

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    // prepare the data
    log.debug("partition: " + partition + "numInstances: " + instances.size());

    Data data = new Data(getDataset(), instances);
    Bagging bagging = new Bagging(getTreeBuilder(), data);

    TreeID key = new TreeID();

    log.debug("Building " + nbTrees + " trees");
    SingleTreePredictions callback = null;
    int[] predictions = null;
    for (int treeId = 0; treeId < nbTrees; treeId++) {
      log.debug("Building tree N° : " + treeId);
      if (isOobEstimate() && !isNoOutput()) {
        callback = new SingleTreePredictions(data.size());
        predictions = callback.getPredictions();
      }

      Node tree = bagging.build(treeId, rng, callback);
View Full Code Here

    IgSplit ref = new DefaultIgSplit();
    IgSplit opt = new OptIgSplit();

    Random rng = RandomUtils.getRandom();
    Data data = Utils.randomData(rng, nbAttributes, numInstances);

    for (int nloop = 0; nloop < n; nloop++) {
      int attr = rng.nextInt(data.getDataset().nbAttributes());
      // System.out.println("IsNumerical: " + data.dataset.isNumerical(attr));

      Split expected = ref.computeSplit(data, attr);
      Split actual = opt.computeSplit(data, attr);
View Full Code Here

    // store the data into a file
    String[] sData = Utils.double2String(source);
    Path dataPath = Utils.writeDataToTestFile(sData);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    Data data = DataLoader.loadData(dataset, sData);

    JobConf jobConf = new JobConf();
    jobConf.setNumMapTasks(numMaps);

    // prepare a custom TreeBuilder that will classify each
View Full Code Here

    // store the data into a file
    String[] sData = Utils.double2String(source);
    Path dataPath = Utils.writeDataToTestFile(sData);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    Data data = DataLoader.loadData(dataset, sData);

    Configuration conf = new Configuration();
    Step0JobTest.setMaxSplitSize(conf, dataPath, numMaps);

    // prepare a custom TreeBuilder that will classify each
View Full Code Here

    for (int nloop = 0; nloop < n; nloop++) {
      int nbAttributes = rng.nextInt(maxNbAttributes) + 1;

      // generate a small data, only to get the dataset
      Data data = Utils.randomData(rng, nbAttributes, 1);
      if (data.getDataset().nbAttributes() == 0)
        continue;

      int m = rng.nextInt(data.getDataset().nbAttributes()) + 1;

      int[] attrs = DefaultTreeBuilder.randomAttributes(data.getDataset(), rng, m);

      assertEquals(m, attrs.length);

      for (int index = 0; index < m; index++) {
        int attr = attrs[index];
View Full Code Here

    // all the vectors have the same label (0)
    double[][] temp = Utils.randomDoublesWithSameLabel(rng, descriptor, 100, 0);
    String[] sData = Utils.double2String(temp);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    Data data = DataLoader.loadData(dataset, sData);
    DefaultIgSplit iG = new DefaultIgSplit();

    double expected = 0.0 - 1.0 * Math.log(1.0) / Math.log(2.0);
    assertEquals(expected, iG.entropy(data));

View Full Code Here

  }
 
  protected static Data loadData(Configuration conf, Path dataPath, Dataset dataset) throws IOException {
    log.info("Loading the data...");
    FileSystem fs = dataPath.getFileSystem(conf);
    Data data = DataLoader.loadData(dataset, fs, dataPath);
    log.info("Data Loaded");
   
    return data;
  }
View Full Code Here

TOP

Related Classes of org.apache.mahout.df.data.Data

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.