int chunkSizeInMegabytes,
Writable value,
int[] maxTermDimension) throws IOException {
List<Path> chunkPaths = new ArrayList<Path>();
Writable key = new Text();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);
FileStatus[] outputFiles = fs.globStatus(new Path(wordCountPath.toString()
+ OUTPUT_FILES_PATTERN));
long chunkSizeLimit = chunkSizeInMegabytes * 1024 * 1024;
int chunkIndex = 0;
Path chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE,
chunkIndex);
chunkPaths.add(chunkPath);
SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
IntWritable.class);
long currentChunkSize = 0;
int i = 0;
for (FileStatus fileStatus : outputFiles) {
Path path = fileStatus.getPath();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
// key is the feature; value is its count
while (reader.next(key, value)) {
if (currentChunkSize > chunkSizeLimit) {
dictWriter.close();
chunkIndex++;
chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE,
chunkIndex);
chunkPaths.add(chunkPath);
dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
currentChunkSize = 0;
}
int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2
+ Integer.SIZE / 8;
currentChunkSize += fieldSize;
dictWriter.append(key, new IntWritable(i++));
}
}