Package org.apache.mahout.classifier.df.data

Examples of org.apache.mahout.classifier.df.data.Dataset


    private int nblabels;
   
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      Dataset dataset = Builder.loadDataset(conf);
      setup(dataset.nblabels());
    }
View Full Code Here


    // prepare the data
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, false, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, false, sData);
    String[][] splits = Utils.splitData(sData, NUM_MAPPERS);

    MockTreeBuilder treeBuilder = new MockTreeBuilder();

    LongWritable key = new LongWritable();
View Full Code Here

      partPaths[p] = new Path(partsPath, String.format(Locale.ENGLISH, "part.%03d", p));
      files[p] = pfs.create(partPaths[p]);
    }
   
    Path datasetPath = new Path(datasetStr);
    Dataset dataset = Dataset.load(conf, datasetPath);
   
    // currents[label] = next partition file where to place the tuple
    int[] currents = new int[dataset.nblabels()];
   
    // currents is initialized randomly in the range [0, numpartitions[
    Random random = RandomUtils.getRandom();
    for (int c = 0; c < currents.length; c++) {
      currents[c] = random.nextInt(numPartitions);
    }
   
    // foreach tuple of the data
    Path dataPath = new Path(dataStr);
    FileSystem ifs = dataPath.getFileSystem(conf);
    FSDataInputStream input = ifs.open(dataPath);
    Scanner scanner = new Scanner(input, "UTF-8");
    DataConverter converter = new DataConverter(dataset);
   
    int id = 0;
    while (scanner.hasNextLine()) {
      if (id % 1000 == 0) {
        log.info("progress : {}", id);
      }
     
      String line = scanner.nextLine();
      if (line.isEmpty()) {
        continue; // skip empty lines
      }
     
      // write the tuple in files[tuple.label]
      Instance instance = converter.convert(line);
      int label = (int) dataset.getLabel(instance);
      files[currents[label]].writeBytes(line);
      files[currents[label]].writeChar('\n');
     
      // update currents
      currents[label]++;
View Full Code Here

  private static Split categoricalSplit(Data data, int attr) {
    double[] values = data.values(attr);
    int[][] counts = new int[values.length][data.getDataset().nblabels()];
    int[] countAll = new int[data.getDataset().nblabels()];

    Dataset dataset = data.getDataset();

    // compute frequencies
    for (int index = 0; index < data.size(); index++) {
      Instance instance = data.get(index);
      counts[ArrayUtils.indexOf(values, instance.get(attr))][(int) dataset.getLabel(instance)]++;
      countAll[(int) dataset.getLabel(instance)]++;
    }

    int size = data.size();
    double hy = entropy(countAll, size); // H(Y)
    double hyx = 0.0; // H(Y|X)
View Full Code Here

    countAll = new int[data.getDataset().nblabels()];
    countLess = new int[data.getDataset().nblabels()];
  }

  void computeFrequencies(Data data, int attr, double[] values) {
    Dataset dataset = data.getDataset();

    for (int index = 0; index < data.size(); index++) {
      Instance instance = data.get(index);
      counts[ArrayUtils.indexOf(values, instance.get(attr))][(int) dataset.getLabel(instance)]++;
      countAll[(int) dataset.getLabel(instance)]++;
    }
  }
View Full Code Here

    String descriptor = DescriptorUtils.generateDescriptor(description);

    Path fPath = validateOutput(filePath);

    log.info("generating the dataset...");
    Dataset dataset = generateDataset(descriptor, dataPath, regression);

    log.info("storing the dataset description");
    String json = dataset.toJSON();
    DFUtils.storeString(new Configuration(), fPath, json);
  }
View Full Code Here

   *          attribute names
   */
  public static String toString(String forestPath, String datasetPath, String[] attrNames) throws IOException {
    Configuration conf = new Configuration();
    DecisionForest forest = DecisionForest.load(conf, new Path(forestPath));
    Dataset dataset = Dataset.load(conf, new Path(datasetPath));
    return toString(forest, dataset, attrNames);
  }
View Full Code Here

    private int nblabels;
   
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      Dataset dataset = Builder.loadDataset(conf);
      setup(dataset.nblabels());
    }
View Full Code Here

    Random rng = RandomUtils.getRandom();

    String[] source = Utils.double2String(dData);
    String descriptor = "N N N N N N N N L";

    Dataset dataset = DataLoader.generateDataset(descriptor, false, source);
    Data data = DataLoader.loadData(dataset, source);
    TreeBuilder builder = new DecisionTreeBuilder();
    builder.build(rng, data);

    // regression
View Full Code Here

    classifier.run();

    if (analyze) {
      double[][] results = classifier.getResults();
      if (results != null) {
        Dataset dataset = Dataset.load(getConf(), datasetPath);
        if (dataset.isNumerical(dataset.getLabelId())) {
          RegressionResultAnalyzer regressionAnalyzer = new RegressionResultAnalyzer();
          regressionAnalyzer.setInstances(results);
          log.info("{}", regressionAnalyzer);
        } else {
          ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");
          for (double[] res : results) {
            analyzer.addInstance(dataset.getLabelString(res[0]),
              new ClassifierResult(dataset.getLabelString(res[1]), 1.0));
          }
          log.info("{}", analyzer);
        }
      }
    }
View Full Code Here

TOP

Related Classes of org.apache.mahout.classifier.df.data.Dataset

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.