Examples of org.apache.mahout.classifier.df.data.Dataset

Package org.apache.mahout.classifier.df.data

Examples of org.apache.mahout.classifier.df.data.Dataset

org.apache.mahout.classifier.df.data.Dataset
Contains information about the attributes.

    private int nblabels;
    
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      Dataset dataset = Builder.loadDataset(conf);
      setup(dataset.nblabels());
    }

View Full Code Here


    // prepare the data
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, false, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, false, sData);
    String[][] splits = Utils.splitData(sData, NUM_MAPPERS);


    MockTreeBuilder treeBuilder = new MockTreeBuilder();


    LongWritable key = new LongWritable();

View Full Code Here

      partPaths[p] = new Path(partsPath, String.format(Locale.ENGLISH, "part.%03d", p));
      files[p] = pfs.create(partPaths[p]);
    }
    
    Path datasetPath = new Path(datasetStr);
    Dataset dataset = Dataset.load(conf, datasetPath);
    
    // currents[label] = next partition file where to place the tuple
    int[] currents = new int[dataset.nblabels()];
    
    // currents is initialized randomly in the range [0, numpartitions[
    Random random = RandomUtils.getRandom();
    for (int c = 0; c < currents.length; c++) {
      currents[c] = random.nextInt(numPartitions);
    }
    
    // foreach tuple of the data
    Path dataPath = new Path(dataStr);
    FileSystem ifs = dataPath.getFileSystem(conf);
    FSDataInputStream input = ifs.open(dataPath);
    Scanner scanner = new Scanner(input, "UTF-8");
    DataConverter converter = new DataConverter(dataset);
    
    int id = 0;
    while (scanner.hasNextLine()) {
      if (id % 1000 == 0) {
        log.info("progress : {}", id);
      }
      
      String line = scanner.nextLine();
      if (line.isEmpty()) {
        continue; // skip empty lines
      }
      
      // write the tuple in files[tuple.label]
      Instance instance = converter.convert(line);
      int label = (int) dataset.getLabel(instance);
      files[currents[label]].writeBytes(line);
      files[currents[label]].writeChar('\n');
      
      // update currents
      currents[label]++;

View Full Code Here

  private static Split categoricalSplit(Data data, int attr) {
    double[] values = data.values(attr);
    int[][] counts = new int[values.length][data.getDataset().nblabels()];
    int[] countAll = new int[data.getDataset().nblabels()];


    Dataset dataset = data.getDataset();


    // compute frequencies
    for (int index = 0; index < data.size(); index++) {
      Instance instance = data.get(index);
      counts[ArrayUtils.indexOf(values, instance.get(attr))][(int) dataset.getLabel(instance)]++;
      countAll[(int) dataset.getLabel(instance)]++;
    }


    int size = data.size();
    double hy = entropy(countAll, size); // H(Y)
    double hyx = 0.0; // H(Y|X)

View Full Code Here

    countAll = new int[data.getDataset().nblabels()];
    countLess = new int[data.getDataset().nblabels()];
  }


  void computeFrequencies(Data data, int attr, double[] values) {
    Dataset dataset = data.getDataset();


    for (int index = 0; index < data.size(); index++) {
      Instance instance = data.get(index);
      counts[ArrayUtils.indexOf(values, instance.get(attr))][(int) dataset.getLabel(instance)]++;
      countAll[(int) dataset.getLabel(instance)]++;
    }
  }

View Full Code Here

    String descriptor = DescriptorUtils.generateDescriptor(description);


    Path fPath = validateOutput(filePath);


    log.info("generating the dataset...");
    Dataset dataset = generateDataset(descriptor, dataPath, regression);


    log.info("storing the dataset description");
    String json = dataset.toJSON();
    DFUtils.storeString(new Configuration(), fPath, json);
  }

View Full Code Here

   *          attribute names
   */
  public static String toString(String forestPath, String datasetPath, String[] attrNames) throws IOException {
    Configuration conf = new Configuration();
    DecisionForest forest = DecisionForest.load(conf, new Path(forestPath));
    Dataset dataset = Dataset.load(conf, new Path(datasetPath));
    return toString(forest, dataset, attrNames);
  }

View Full Code Here

    private int nblabels;
    
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      Dataset dataset = Builder.loadDataset(conf);
      setup(dataset.nblabels());
    }

View Full Code Here

    Random rng = RandomUtils.getRandom();


    String[] source = Utils.double2String(dData);
    String descriptor = "N N N N N N N N L";


    Dataset dataset = DataLoader.generateDataset(descriptor, false, source);
    Data data = DataLoader.loadData(dataset, source);
    TreeBuilder builder = new DecisionTreeBuilder();
    builder.build(rng, data);


    // regression

View Full Code Here

    classifier.run();


    if (analyze) {
      double[][] results = classifier.getResults();
      if (results != null) {
        Dataset dataset = Dataset.load(getConf(), datasetPath);
        if (dataset.isNumerical(dataset.getLabelId())) {
          RegressionResultAnalyzer regressionAnalyzer = new RegressionResultAnalyzer();
          regressionAnalyzer.setInstances(results);
          log.info("{}", regressionAnalyzer);
        } else {
          ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");
          for (double[] res : results) {
            analyzer.addInstance(dataset.getLabelString(res[0]),
              new ClassifierResult(dataset.getLabelString(res[1]), 1.0));
          }
          log.info("{}", analyzer);
        }
      }
    }

View Full Code Here

0 1 2 3

TOP

Related Classes of org.apache.mahout.classifier.df.data.Dataset

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.fs.FSDataInputStream

org.apache.mahout.classifier.df.BreimanExample

org.apache.mahout.classifier.df.builder.InfiniteRecursionTest

org.apache.mahout.classifier.df.DecisionForestTest

org.apache.mahout.classifier.df.mapreduce.partial.Step1MapperTest

org.apache.mahout.classifier.df.mapreduce.TestForest

org.apache.mahout.classifier.df.split.DefaultIgSplitTest

org.apache.mahout.classifier.df.split.OptIgSplit

org.apache.mahout.classifier.df.split.RegressionSplitTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.