// foreach tuple of the data
// Open dataStr through the FileSystem that owns the path and read it line by
// line, decoding the bytes as UTF-8.
// NOTE(review): closing of scanner/input is not visible in this part of the
// method — confirm they are closed after the loop (ideally in a finally block).
Path dataPath = new Path(dataStr);
FileSystem ifs = dataPath.getFileSystem(conf);
FSDataInputStream input = ifs.open(dataPath);
Scanner scanner = new Scanner(input, "UTF-8");
// Built from dataset; used below to convert each text line into an Instance.
DataConverter converter = new DataConverter(dataset);
// Counter reported in the progress log.
// NOTE(review): id is not incremented in the visible part of the loop — confirm
// it is advanced further down, otherwise the progress message is logged for
// every line and always reports 0.
int id = 0;
while (scanner.hasNextLine()) {
// log progress whenever id is a multiple of 1000
if (id % 1000 == 0) {
log.info("progress : {}", id);
}
String line = scanner.nextLine();
if (line.isEmpty()) {
continue; // skip empty lines
}
// write the tuple in files[tuple.label]
Instance instance = converter.convert(line);
int label = (int) dataset.getLabel(instance);
// Emit the line as UTF-8 bytes followed by exactly one '\n' byte.
// writeBytes(String) keeps only the low-order byte of each char, which corrupts
// non-ASCII text, and writeChar('\n') writes two bytes (0x00 0x0A), which leaves
// a stray NUL before every newline. Plain write(...) calls avoid both problems.
files[currents[label]].write(line.getBytes("UTF-8"));
files[currents[label]].write('\n');
// update currents