/**
 * Configure and run the Hadoop job that builds the store.
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                 new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if(!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if(reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
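        // Settings common to both paths: input/output formats, output key/value
        // types, the job jar, and reduce-side speculative execution disabled.
        // Several of these are overridden again in the Avro branch below.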
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);
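        // The final output directory must not exist yet; the job itself writes to
        // tempDir (set as the Hadoop output path above), which is cleared first.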
        FileSystem outputFs = outputDir.getFileSystem(conf);
        if(outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete the temporary output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);
        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = "
                    + storeDef.getReplicationFactor() + ", numNodes = "
                    + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
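        // When numChunks is not given (-1), it is estimated as the replicated data
        // volume per partition divided by the target chunk size, floored at 1 (in
        // the save-keys branch the replication factor cancels out of the formula).
        // Reducers = number of partitions, times the replication factor when keys
        // are saved, times numChunks unless one reducer per bucket is requested.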
        int numReducers;
        if(saveKeys) {
            if(this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                                                 / cluster.getNumberOfPartitions()
                                                 / storeDef.getReplicationFactor()
                                                 / chunkSizeBytes),
                                          1);
            } else {
                logger.info("Overriding chunk size (in bytes) and using the given number of chunks ("
                            + this.numChunks + ") directly");
            }
            if(reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor()
                              * numChunks;
            }
        } else {
            if(this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                                                 / cluster.getNumberOfPartitions()
                                                 / chunkSizeBytes),
                                          1);
            } else {
                logger.info("Overriding chunk size (in bytes) and using the given number of chunks ("
                            + this.numChunks + ") directly");
            }
            if(reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
conf.setInt("num.chunks", numChunks);
conf.setNumReduceTasks(numReducers);
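        // Avro input: override the generic settings above with ByteBuffer key/value
        // types, the Avro output format, and AvroJob-managed schemas and mapper.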
        if(isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);
            conf.setInputFormat(inputFormatClass);
            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));
            AvroJob.setOutputSchema(conf,
                                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES),
                                                       Schema.create(Schema.Type.BYTES)));
            AvroJob.setMapperClass(conf, mapperClass);
            if(reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }
        }
logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers