partPaths[p] = new Path(partsPath, String.format("part.%03d", p));
files[p] = pfs.create(partPaths[p]);
}
// Load the Dataset from the location given by datasetStr.
Path datasetPath = new Path(datasetStr);
Dataset dataset = Dataset.load(conf, datasetPath);
// currents[label] = index of the next partition file in which to place a tuple
// carrying that label (one slot per label, sized by dataset.nblabels())
int[] currents = new int[dataset.nblabels()];
// every entry of currents starts at a random index in the range [0, numPartitions)
Random random = RandomUtils.getRandom();
for (int c = 0; c < currents.length; c++) {
currents[c] = random.nextInt(numPartitions);
}
// Open the input data file; the loop that follows reads it line by line.
Path dataPath = new Path(dataStr);
FileSystem ifs = dataPath.getFileSystem(conf);
FSDataInputStream input = ifs.open(dataPath);
// NOTE(review): no charset is passed, so Scanner decodes the stream with the
// platform default charset -- confirm this matches the encoding of the data file.
Scanner scanner = new Scanner(input);
DataConverter converter = new DataConverter(dataset);
// nbInstances and id are the total and the running counter reported by the
// progress log in the loop that follows.
int nbInstances = dataset.nbInstances();
int id = 0;
while (scanner.hasNextLine()) {
if (id % 1000 == 0) {
log.info(String.format("progress : %d / %d", id, nbInstances));