Package org.apache.mahout.df.data

Examples of org.apache.mahout.df.data.Dataset, collected from Mahout's decision-forest (random forest) code and tests


    log.info("DistributedCache.getCacheFiles(): {}", ArrayUtils.toString(files));
   
    Preconditions.checkArgument(files != null && files.length >= 2, "missing paths from the DistributedCache" );
   
    Path datasetPath = new Path(files[0].getPath());
    Dataset dataset = Dataset.load(conf, datasetPath);
   
    int numMaps = Builder.getNumMaps(conf);
    int p = conf.getInt("mapred.task.partition", -1);
   
    // total number of trees in the forest
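
For context, a minimal, self-contained sketch of the surrounding pattern: a mapper whose setup() pulls the cached files out of the DistributedCache and loads the Dataset from the first one. The mapper class and its type parameters are hypothetical; Dataset.load and DistributedCache.getCacheFiles are the same calls used above.

    import java.io.IOException;
    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.filecache.DistributedCache;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.mahout.df.data.Dataset;

    // hypothetical mapper illustrating the loading pattern above
    public class DatasetAwareMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

      private Dataset dataset;

      @Override
      protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();

        // files registered on the client side with DistributedCache.addCacheFile(...)
        URI[] files = DistributedCache.getCacheFiles(conf);
        if (files == null || files.length < 1) {
          throw new IOException("missing paths from the DistributedCache");
        }

        // by the convention used above, files[0] points at the serialized Dataset
        dataset = Dataset.load(conf, new Path(files[0].getPath()));
      }
    }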


    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);
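
DataLoader.generateDataset pairs an attribute descriptor with the raw CSV lines. A minimal, self-contained sketch with made-up values, assuming the usual df descriptor tokens: N (numerical), C (categorical), L (label), I (ignored).

    import org.apache.mahout.df.data.DataConverter;
    import org.apache.mahout.df.data.DataLoader;
    import org.apache.mahout.df.data.Dataset;

    public class DescriptorExample {
      public static void main(String[] args) throws Exception {
        // two numerical columns, one categorical column, then the label column
        String descriptor = "N N C L";
        String[] sData = { "1.0,2.5,red,yes", "0.3,4.1,blue,no" };

        // build the dataset schema from the descriptor and the sample lines,
        // then a converter that turns raw lines into instances
        Dataset dataset = DataLoader.generateDataset(descriptor, sData);
        DataConverter converter = new DataConverter(dataset);
      }
    }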

    String descriptor = DescriptorUtils.generateDescriptor(description);
   
    Path fPath = validateOutput(filePath);
   
    log.info("generating the dataset...");
    Dataset dataset = generateDataset(descriptor, dataPath);
   
    log.info("storing the dataset description");
    DFUtils.storeWritable(new Configuration(), fPath, dataset);
  }
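
Here the descriptor itself comes from DescriptorUtils.generateDescriptor, which expands a shorthand description; assuming the convention that a count repeats the token that follows it:

    // "4 N C L" reads as four numerical attributes, one categorical, one label;
    // under the assumed convention it expands to "N N N N C L"
    String descriptor = DescriptorUtils.generateDescriptor("4 N C L");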

   
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
     
      Dataset dataset = Builder.loadDataset(conf);
      setup(dataset);
    }
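
The override above delegates to a setup(Dataset) overload whose body is not shown. A plausible sketch, with hypothetical field names:

    private Dataset dataset;
    private DataConverter converter;

    // hedged sketch: cache the dataset and a converter for later use in map()
    protected void setup(Dataset dataset) {
      this.dataset = dataset;
      this.converter = new DataConverter(dataset);
    }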

    private int nblabels;
   
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      Dataset dataset = Builder.loadDataset(conf);
      setup(dataset.nblabels());
    }
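
The nblabels variant only needs the label count, typically to size per-label accumulators (the partitioning snippet further down allocates new int[dataset.nblabels()] the same way). A hedged sketch of the delegated overload:

    // hedged sketch: remember how many distinct label values exist
    protected void setup(int nblabels) {
      this.nblabels = nblabels;
      // downstream code can now allocate one slot per label,
      // e.g. int[] counts = new int[nblabels];
    }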

      if ((files == null) || (files.length < 2)) {
        throw new IOException("not enough paths in the DistributedCache");
      }

      Dataset dataset = Dataset.load(conf, new Path(files[0].getPath()));

      converter = new DataConverter(dataset);

      forest = DecisionForest.load(conf, new Path(files[1].getPath()));
      if (forest == null) {
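
The snippet is cut off inside the null check; the original error handling is elided, but a plausible completion simply fails fast:

      // hedged completion of the truncated check above
      if (forest == null) {
        throw new IOException("DecisionForest not found in the DistributedCache");
      }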

      partPaths[p] = new Path(partsPath, String.format(Locale.ENGLISH, "part.%03d", p));
      files[p] = pfs.create(partPaths[p]);
    }
   
    Path datasetPath = new Path(datasetStr);
    Dataset dataset = Dataset.load(conf, datasetPath);
   
    // currents[label] = index of the next partition file to receive a tuple with that label
    int[] currents = new int[dataset.nblabels()];

    // currents is initialized randomly in the range [0, numPartitions)
    Random random = RandomUtils.getRandom();
    for (int c = 0; c < currents.length; c++) {
      currents[c] = random.nextInt(numPartitions);
    }

    // for each tuple of the data
    Path dataPath = new Path(dataStr);
    FileSystem ifs = dataPath.getFileSystem(conf);
    FSDataInputStream input = ifs.open(dataPath);
    Scanner scanner = new Scanner(input);
    DataConverter converter = new DataConverter(dataset);
    int nbInstances = dataset.nbInstances();
   
    int id = 0;
    while (scanner.hasNextLine()) {
      if (id % 1000 == 0) {
        log.info("progress : {} / {}", id, nbInstances);
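
The loop is cut off after the progress log. A hedged sketch of the dispatch it leads into, with currents[] as the per-label cursor; labelOf(...) is a hypothetical stand-in, and the DataConverter.convert(id, line) signature is an assumption here, not a confirmed API:

    // hedged sketch of the full read/dispatch loop (the original body is elided)
    int id = 0;
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine();
      if (line.isEmpty()) {
        continue; // skip blank lines
      }

      // read the label of this tuple (labelOf is a hypothetical stand-in)
      int label = labelOf(converter.convert(id, line));

      // append the raw line to the partition file its label currently points at
      int partition = currents[label];
      files[partition].writeBytes(line);
      files[partition].writeChar('\n');

      // round-robin: the next tuple with this label goes to the next partition
      currents[label] = (currents[label] + 1) % numPartitions;

      id++;
    }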

    }

    // store the data into a file
    String[] sData = Utils.double2String(source);
    Path dataPath = Utils.writeDataToTestFile(sData);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    Data data = DataLoader.loadData(dataset, sData);

    Configuration conf = new Configuration();
    Step0JobTest.setMaxSplitSize(conf, dataPath, NUM_MAPS);
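
Step0JobTest.setMaxSplitSize is a test helper. A hedged sketch of what such a helper can do: cap the split size so the input file yields roughly NUM_MAPS splits (mapred.max.split.size is the property consulted by the pre-2.x new-API FileInputFormat):

    // hedged sketch of a split-size helper for tests
    static void setMaxSplitSize(Configuration conf, Path dataPath, int numMaps) throws IOException {
      FileSystem fs = dataPath.getFileSystem(conf);
      long fileSize = fs.getFileStatus(dataPath).getLen();
      // slight over-division so the file yields about numMaps splits
      conf.setLong("mapred.max.split.size", fileSize / numMaps + 1);
    }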


    // prepare the data
    String descriptor = Utils.randomDescriptor(rng, nbAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, nbInstances);
    String[] sData = Utils.double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    String[][] splits = Utils.splitData(sData, nbMappers);

    // prepare first step output
    TreeID[] keys = new TreeID[nbTrees];
    Node[] trees = new Node[nbTrees];
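
The keys and trees arrays then receive synthetic first-step output. A hedged sketch, assuming TreeID pairs a partition with a per-tree id and that a single Leaf node can stand in for a whole tree (both assumptions about the partial-builder classes these tests exercise):

    // hedged sketch: fabricate one key/tree pair per expected tree
    for (int treeId = 0; treeId < nbTrees; treeId++) {
      int partition = treeId % nbMappers;          // spread the trees across the mappers
      keys[treeId] = new TreeID(partition, treeId);
      trees[treeId] = new Leaf(-1);                // placeholder single-leaf tree
    }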
