 * @param all
 *          if true, select all categories
 */
public static void runJob(String input, String output, String catFile,
                          boolean exactMatchOnly, boolean all) throws IOException {
  JobClient client = new JobClient();
  JobConf conf = new JobConf(WikipediaToSequenceFile.class);
  if (log.isInfoEnabled()) {
    log.info("Input: " + input + " Out: " + output + " Categories: " + catFile
             + " All Files: " + all);
  }
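  // XmlInputFormat uses these start/end tags to split the dump so that each map
  // input value is one complete <page>...</page> element.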
conf.set("xmlinput.start", "<page>");
conf.set("xmlinput.end", "</page>");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setBoolean("exact.match.only", exactMatchOnly);
conf.setBoolean("all.files", all);
  FileInputFormat.setInputPaths(conf, new Path(input));
  Path outPath = new Path(output);
  FileOutputFormat.setOutputPath(conf, outPath);
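  // WikipediaMapper turns each page into a key/value pair (applying the category
  // filter configured below); IdentityReducer passes those pairs through unchanged.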
  conf.setMapperClass(WikipediaMapper.class);
  conf.setInputFormat(XmlInputFormat.class);
  conf.setReducerClass(IdentityReducer.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
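  // Register JavaSerialization alongside WritableSerialization so DefaultStringifier
  // (used below) can encode the category Set into the job configuration.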
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ "org.apache.hadoop.io.serializer.WritableSerialization");
  /*
   * conf.set("mapred.compress.map.output", "true");
   * conf.set("mapred.map.output.compression.type", "BLOCK");
   * conf.set("mapred.output.compress", "true");
   * conf.set("mapred.output.compression.type", "BLOCK");
   * conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
   */
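  // Remove any existing output directory so the job does not fail on an existing path.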
  FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
  if (dfs.exists(outPath)) {
    dfs.delete(outPath, true);
  }
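  // Load the category list, if a file was given: one category per line, trimmed and
  // lower-cased.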
  Set<String> categories = new HashSet<String>();
  if (catFile.length() > 0) {
    for (String line : new FileLineIterable(new File(catFile))) {
      categories.add(line.trim().toLowerCase());
    }
  }
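  // Stringify the category set into the configuration so mappers can read it back.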
  DefaultStringifier<Set<String>> setStringifier =
      new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
  String categoriesStr = setStringifier.toString(categories);
  conf.set("wikipedia.categories", categoriesStr);
  client.setConf(conf);
  JobClient.runJob(conf);
}