partPaths[p] = new Path(partsPath, String.format(Locale.ENGLISH, "part.%03d", p));
files[p] = pfs.create(partPaths[p]);
}
Path datasetPath = new Path(datasetStr);
Dataset dataset = Dataset.load(conf, datasetPath);
// currents[label] = index of the partition file that will receive the next tuple with this label
int[] currents = new int[dataset.nblabels()];
// each entry of currents starts at a random index in the range [0, numPartitions)
Random random = RandomUtils.getRandom();
for (int c = 0; c < currents.length; c++) {
currents[c] = random.nextInt(numPartitions);
}
// foreach tuple of the data
Path dataPath = new Path(dataStr);
FileSystem ifs = dataPath.getFileSystem(conf);
FSDataInputStream input = ifs.open(dataPath);
Scanner scanner = new Scanner(input, "UTF-8");
DataConverter converter = new DataConverter(dataset);
int id = 0;
while (scanner.hasNextLine()) {
if (id % 1000 == 0) {
log.info("progress : {}", id);
}
String line = scanner.nextLine();
if (line.isEmpty()) {
continue; // skip empty lines
}
// write the tuple to the partition file currently selected for its label: files[currents[label]]
Instance instance = converter.convert(line);
int label = (int) dataset.getLabel(instance);
files[currents[label]].writeBytes(line);
files[currents[label]].writeChar('\n');
// update currents
currents[label]++;