Package org.apache.mahout.classifier.df.data

Examples of org.apache.mahout.classifier.df.data.Data
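
The snippets below come from Apache Mahout's decision-forest (df) code and its tests. A Data object pairs a Dataset (the attribute schema) with a collection of instances and exposes the operations the tree builders rely on: subset(Condition) for splitting, bagging(Random, boolean[]) for bootstrap sampling, plus size() and isEmpty().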


  protected static Data loadData(Configuration conf, Path dataPath, Dataset dataset) throws IOException {
    log.info("Loading the data...");
    FileSystem fs = dataPath.getFileSystem(conf);
    Data data = DataLoader.loadData(dataset, fs, dataPath);
    log.info("Data Loaded");
   
    return data;
  }
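Besides loading from an HDFS path as above, DataLoader can also build a Data object directly from an in-memory array of value strings, which the test snippets further down do as well. A minimal sketch, assuming comma-separated attribute values; the descriptor, sample rows and class name are invented for illustration, and the import paths follow Mahout's df package layout.

import org.apache.mahout.classifier.df.data.Data;
import org.apache.mahout.classifier.df.data.DataLoader;
import org.apache.mahout.classifier.df.data.Dataset;

public class InMemoryLoadSketch {
  public static void main(String[] args) throws Exception {
    // Two numerical attributes followed by the (categorical) label.
    String descriptor = "N N L";
    String[] source = {
        "1.0,5.0,yes",
        "2.0,3.0,no",
        "4.0,8.0,yes"
    };

    // false = classification; the regression snippet near the end passes true instead.
    Dataset dataset = DataLoader.generateDataset(descriptor, false, source);
    Data data = DataLoader.loadData(dataset, source);
    System.out.println("Loaded " + data.size() + " instances");
  }
}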


    Node childNode;
    if (data.getDataset().isNumerical(best.getAttr())) {
      boolean[] temp = null;

      Data loSubset = data.subset(Condition.lesser(best.getAttr(), best.getSplit()));
      Data hiSubset = data.subset(Condition.greaterOrEquals(best.getAttr(), best.getSplit()));

      if (loSubset.isEmpty() || hiSubset.isEmpty()) {
        // the selected attribute did not change the data, avoid using it in the child nodes
        selected[best.getAttr()] = true;
      } else {
        // the data changed, so we can unselect all previously selected NUMERICAL attributes
        temp = selected;
        selected = cloneCategoricalAttributes(data.getDataset(), selected);
      }

      // size of the subset is less than minSplitNum
      if (loSubset.size() < minSplitNum || hiSubset.size() < minSplitNum) {
        // branch is not split
        double label;
        if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
          label = sum / data.size();
        } else {
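The heart of the snippet above is Data.subset(Condition), which returns the instances satisfying a predicate on a single attribute. A hedged sketch of the three conditions the builders use; the attribute index and split value are placeholders, the Data argument is assumed to have been loaded as in the earlier sketch, and the imports again assume Mahout's df layout.

import org.apache.mahout.classifier.df.data.Data;
import org.apache.mahout.classifier.df.data.conditions.Condition;

final class SubsetSketch {
  static void numericalSplit(Data data, int attr, double split) {
    // Instances whose attribute value is strictly below the split point...
    Data lo = data.subset(Condition.lesser(attr, split));
    // ...and those at or above it.
    Data hi = data.subset(Condition.greaterOrEquals(attr, split));
    // A degenerate split leaves one side empty, which is why the builder
    // checks isEmpty() before recursing into child nodes.
    System.out.println(lo.size() + " / " + hi.size() + " of " + data.size());
  }

  static void categoricalSplit(Data data, int attr, double encodedValue) {
    // Categorical attributes are split by equality on the encoded value.
    Data branch = data.subset(Condition.equals(attr, encodedValue));
    System.out.println(branch.size() + " instances match value " + encodedValue);
  }
}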

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    // prepare the data
    log.debug("partition: {} numInstances: {}", partition, instances.size());
   
    Data data = new Data(getDataset(), instances);
    Bagging bagging = new Bagging(getTreeBuilder(), data);
   
    TreeID key = new TreeID();
   
    log.debug("Building {} trees", nbTrees);
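The mapper hands the assembled Data to Bagging, which repeatedly draws a bootstrap sample and asks the TreeBuilder to grow a tree from it (see the build(Random) snippet below). A minimal sketch of that loop outside Hadoop; the seed, tree count and class name are arbitrary, and the import paths are assumed from Mahout's df layout.

import java.util.Random;

import org.apache.mahout.classifier.df.Bagging;
import org.apache.mahout.classifier.df.builder.DecisionTreeBuilder;
import org.apache.mahout.classifier.df.data.Data;
import org.apache.mahout.classifier.df.node.Node;

final class BaggingSketch {
  static void buildTrees(Data data) {
    Random rng = new Random(42L);                  // arbitrary seed
    Bagging bagging = new Bagging(new DecisionTreeBuilder(), data);
    for (int i = 0; i < 5; i++) {                  // arbitrary number of trees
      Node tree = bagging.build(rng);              // one bootstrap sample, one tree
      System.out.println("Built tree " + i + ": " + tree);
    }
  }
}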

    super.setup(context);
   
    Configuration conf = context.getConfiguration();
   
    log.info("Loading the data...");
    Data data = loadData(conf, getDataset());
    log.info("Data loaded : {} instances", data.size());
   
    bagging = new Bagging(getTreeBuilder(), data);
  }

    double hy = entropy(data); // H(Y)
    double hyx = 0.0; // H(Y|X)
    double invDataSize = 1.0 / data.size();
   
    for (double value : values) {
      Data subset = data.subset(Condition.equals(attr, value));
      hyx += subset.size() * invDataSize * entropy(subset);
    }
   
    return hy - hyx;
  }

  double numericalIg(Data data, int attr, double split) {
    double hy = entropy(data);
    double invDataSize = 1.0 / data.size();
   
    // LO subset
    Data subset = data.subset(Condition.lesser(attr, split));
    hy -= subset.size() * invDataSize * entropy(subset);
   
    // HI subset
    subset = data.subset(Condition.greaterOrEquals(attr, split));
    hy -= subset.size() * invDataSize * entropy(subset);
   
    return hy;
  }
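Both methods compute an information gain against the label entropy H(Y). For a categorical attribute the gain is H(Y) - sum over values v of (|S_v| / |S|) * H(S_v), where S_v is the subset with attribute value v; for a numerical attribute the candidate split yields just two subsets (values below the split and values at or above it), and the same weighted entropy terms are subtracted, which is exactly what the two subset.size() * invDataSize * entropy(subset) lines do. For example, if a split separates 10 instances into subsets of 6 and 4, the gain is H(Y) - 0.6 * H(lo) - 0.4 * H(hi).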

   * Builds one tree
   */
  public Node build(Random rng) {
    log.debug("Bagging...");
    Arrays.fill(sampled, false);
    Data bag = data.bagging(rng, sampled);
   
    log.debug("Building...");
    return treeBuilder.build(rng, bag);
  }
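Data.bagging(rng, sampled) draws the bootstrap sample itself: data.size() instances are picked with replacement, so some appear several times and others not at all. The sampled array, cleared just before the call, is flagged for every instance that made it into the bag; the unflagged instances are the usual candidates for out-of-bag evaluation.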

  public void testComputeSplit() throws Exception {
    IgSplit ref = new DefaultIgSplit();
    IgSplit opt = new OptIgSplit();

    Random rng = RandomUtils.getRandom();
    Data data = Utils.randomData(rng, NUM_ATTRIBUTES, false, NUM_INSTANCES);

    for (int nloop = 0; nloop < 100; nloop++) {
      int attr = rng.nextInt(data.getDataset().nbAttributes());
      // System.out.println("IsNumerical: " + data.dataset.isNumerical(attr));

      Split expected = ref.computeSplit(data, attr);
      Split actual = opt.computeSplit(data, attr);

    String[] source = Utils.double2String(dData);
    String descriptor = "N N N N N N N N L";

    Dataset dataset = DataLoader.generateDataset(descriptor, false, source);
    Data data = DataLoader.loadData(dataset, source);
    TreeBuilder builder = new DecisionTreeBuilder();
    builder.build(rng, data);

    // regression
    dataset = DataLoader.generateDataset(descriptor, true, source);
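The descriptor string drives generateDataset: in Mahout's df descriptors N marks a numerical attribute, C a categorical one and L the label (an I token marks an attribute to ignore), so "N N N N N N N N L" declares eight numerical attributes followed by the label. The boolean argument selects classification (false) or regression (true), which is why the snippet regenerates the Dataset with true before building a regression tree from the same source.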

