Package org.apache.mahout.classifier.df.data

Examples of org.apache.mahout.classifier.df.data.Dataset


    String descriptor = DescriptorUtils.generateDescriptor(description);

    Path fPath = validateOutput(filePath);

    log.info("generating the dataset...");
    Dataset dataset = generateDataset(descriptor, dataPath, regression);

    log.info("storing the dataset description");
    DFUtils.storeWritable(new Configuration(), fPath, dataset);
  }
View Full Code Here


   */
  public static String toString(String forestPath, String datasetPath, String[] attrNames)
    throws Exception {
    Configuration conf = new Configuration();
    DecisionForest forest = DecisionForest.load(conf, new Path(forestPath));
    Dataset dataset = Dataset.load(conf, new Path(datasetPath));
    return toString(forest, dataset, attrNames);
  }
View Full Code Here

      partPaths[p] = new Path(partsPath, String.format(Locale.ENGLISH, "part.%03d", p));
      files[p] = pfs.create(partPaths[p]);
    }
   
    Path datasetPath = new Path(datasetStr);
    Dataset dataset = Dataset.load(conf, datasetPath);
   
    // currents[label] = next partition file where to place the tuple
    int[] currents = new int[dataset.nblabels()];
   
    // currents is initialized randomly in the range [0, numpartitions[
    Random random = RandomUtils.getRandom();
    for (int c = 0; c < currents.length; c++) {
      currents[c] = random.nextInt(numPartitions);
    }
   
    // foreach tuple of the data
    Path dataPath = new Path(dataStr);
    FileSystem ifs = dataPath.getFileSystem(conf);
    FSDataInputStream input = ifs.open(dataPath);
    Scanner scanner = new Scanner(input);
    DataConverter converter = new DataConverter(dataset);
    int nbInstances = dataset.nbInstances();
   
    int id = 0;
    while (scanner.hasNextLine()) {
      if (id % 1000 == 0) {
        log.info("progress : {} / {}", id, nbInstances);
      }
     
      String line = scanner.nextLine();
      if (line.isEmpty()) {
        continue; // skip empty lines
      }
     
      // write the tuple in files[tuple.label]
      Instance instance = converter.convert(line);
      int label = (int) dataset.getLabel(instance);
      files[currents[label]].writeBytes(line);
      files[currents[label]].writeChar('\n');
     
      // update currents
      currents[label]++;
View Full Code Here

      return -1;
    }
   
    // load the data
    FileSystem fs = dataPath.getFileSystem(new Configuration());
    Dataset dataset = Dataset.load(getConf(), datasetPath);
    Data data = DataLoader.loadData(dataset, fs, dataPath);
   
    // take m to be the first integer less than log2(M) + 1, where M is the
    // number of inputs
    int m = (int) Math.floor(Maths.log(2, data.getDataset().nbAttributes()) + 1);
View Full Code Here

    super.setUp();
   
    rng = RandomUtils.getRandom();
   
    // Dataset
    Dataset dataset = DataLoader.generateDataset("C N N C L", false, TRAIN_DATA);
   
    // Training data
    data = DataLoader.loadData(dataset, TRAIN_DATA);
   
    // Test data
View Full Code Here

    classifier.run();

    if (analyze) {
      double[][] results = classifier.getResults();
      if (results != null) {
        Dataset dataset = Dataset.load(getConf(), datasetPath);
        if (dataset.isNumerical(dataset.getLabelId())) {
          RegressionResultAnalyzer regressionAnalyzer = new RegressionResultAnalyzer();
          regressionAnalyzer.setInstances(results);
          log.info("{}", regressionAnalyzer);
        } else {
          ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");
          for (double[] res : results) {
            analyzer.addInstance(dataset.getLabelString(res[0]),
              new ClassifierResult(dataset.getLabelString(res[1]), 1.0));
          }
          log.info("{}", analyzer);
        }
      }
    }
View Full Code Here

      log.error("No Decision Forest found!");
      return;
    }

    // load the dataset
    Dataset dataset = Dataset.load(getConf(), datasetPath);
    DataConverter converter = new DataConverter(dataset);

    log.info("Sequential classification...");
    long time = System.currentTimeMillis();

    Random rng = RandomUtils.getRandom();

    List<double[]> resList = new ArrayList<double[]>();
    if (dataFS.getFileStatus(dataPath).isDir()) {
      //the input is a directory of files
      testDirectory(outputPath, converter, forest, dataset, resList, rng);
    else {
      // the input is one single file
      testFile(dataPath, outputPath, converter, forest, dataset, resList, rng);
    }

    time = System.currentTimeMillis() - time;
    log.info("Classification Time: {}", DFUtils.elapsedTime(time));

    if (analyze) {
      if (dataset.isNumerical(dataset.getLabelId())) {
        RegressionResultAnalyzer regressionAnalyzer = new RegressionResultAnalyzer();
        double[][] results = new double[resList.size()][2];
        regressionAnalyzer.setInstances(resList.toArray(results));
        log.info("{}", regressionAnalyzer);
      } else {
        ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");
        for (double[] r : resList) {
          analyzer.addInstance(dataset.getLabelString(r[0]),
            new ClassifierResult(dataset.getLabelString(r[1]), 1.0));
        }
        log.info("{}", analyzer);
      }
    }
  }
View Full Code Here

    // prepare the data
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, false, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, false, sData);
    String[][] splits = Utils.splitData(sData, NUM_MAPPERS);

    MockTreeBuilder treeBuilder = new MockTreeBuilder();

    LongWritable key = new LongWritable();
View Full Code Here

    super.setUp();
   
    rng = RandomUtils.getRandom(1);
   
    // Dataset
    Dataset dataset = DataLoader
        .generateDataset("C N N C L", false, TRAIN_DATA);
   
    // Training data
    data = DataLoader.loadData(dataset, TRAIN_DATA);
   
View Full Code Here

    rng = RandomUtils.getRandom();
  }

  private static Data[] generateTrainingDataA() throws DescriptorException {
    // Dataset
    Dataset dataset = DataLoader.generateDataset("C N N C L", false, TRAIN_DATA);
   
    // Training data
    Data data = DataLoader.loadData(dataset, TRAIN_DATA);
    @SuppressWarnings("unchecked")
    List<Instance>[] instances = new List[3];
View Full Code Here

TOP

Related Classes of org.apache.mahout.classifier.df.data.Dataset

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.