private static Pair<Long[],List<Path>> createDictionaryChunks(Path featureCountPath,
String dictionaryPathBase,
int chunkSizeInMegabytes) throws IOException {
List<Path> chunkPaths = new ArrayList<Path>();
IntWritable key = new IntWritable();
LongWritable value = new LongWritable();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath.toString()
+ OUTPUT_FILES_PATTERN));
long chunkSizeLimit = chunkSizeInMegabytes * 1024 * 1024;
int chunkIndex = 0;
Path chunkPath = getPath(dictionaryPathBase + FREQUENCY_FILE, chunkIndex);
chunkPaths.add(chunkPath);
SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
LongWritable.class);
long currentChunkSize = 0;
long featureCount = 0;
long vectorCount = Long.MAX_VALUE;
for (FileStatus fileStatus : outputFiles) {
Path path = fileStatus.getPath();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
// key is feature value is count
while (reader.next(key, value)) {
if (currentChunkSize > chunkSizeLimit) {
freqWriter.close();
chunkIndex++;
chunkPath = getPath(dictionaryPathBase + FREQUENCY_FILE, chunkIndex);
chunkPaths.add(chunkPath);
freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
currentChunkSize = 0;
}
int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
currentChunkSize += fieldSize;
if (key.get() >= 0) {
freqWriter.append(key, value);
} else if (key.get() == -1) {
vectorCount = value.get();
}
featureCount = Math.max(key.get(), featureCount);
}
}
featureCount++;
freqWriter.close();