// Switch to this if you'd like to look at all text files. May take many minutes just to read the file listing.
//String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/*/textData-*";
// Creates a new job configuration for this Hadoop job.
JobConf job = new JobConf(this.getConf());
job.setJarByClass(TotalAnalysis.class);
// Build the input paths from the published valid_segments.txt list (the segment IDs that are safe to read)
// instead of globbing every segment -- fix from the Google Groups discussion.
String segmentListFile = "s3n://aws-publicdatasets/common-crawl/parse-output/valid_segments.txt";
FileSystem fsInput = FileSystem.get(new URI(segmentListFile), job);
BufferedReader reader = new BufferedReader(new InputStreamReader(fsInput.open(new Path(segmentListFile))));
String segmentId;
while ((segmentId = reader.readLine()) != null) {
  String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/" + segmentId + "/textData-*";
  FileInputFormat.addInputPath(job, new Path(inputPath));
}
reader.close();
// Read in any additional config parameters.
if (configFile != null) {
  LOG.info("adding config parameters from '" + configFile + "'");
  // Add the resource to the job itself: the JobConf above copied getConf(),
  // so adding it to this.getConf() at this point would not reach the job.
  job.addResource(configFile);
}
// Alternative: scan a single provided input path (disabled in favor of the valid-segments list above).
//LOG.info("setting input path to '"+ inputPath + "'");
//FileInputFormat.addInputPath(job, new Path(inputPath));
//FileInputFormat.setInputPathFilter(job, SampleFilter.class);
// Delete the output path directory if it already exists.
LOG.info("clearing the output path at '" + outputPath + "'");
FileSystem fs = FileSystem.get(new URI(outputPath), job);
if (fs.exists(new Path(outputPath)))
  fs.delete(new Path(outputPath), true);
// Set the path where final output 'part' files will be saved.
LOG.info("setting output path to '" + outputPath + "'");
FileOutputFormat.setOutputPath(job, new Path(outputPath));
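// Leave the output uncompressed so the final 'part' files can be inspected directly.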
FileOutputFormat.setCompressOutput(job, false);
// Set which InputFormat class to use.
job.setInputFormat(SequenceFileInputFormat.class);
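// Set the intermediate key/value types produced by the map phase.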
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
// Set which OutputFormat class to use.
job.setOutputFormat(TextOutputFormat.class);
// Set the output data types.
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Set which Mapper and Reducer classes to use.
job.setMapperClass(TotalAnalysis.TotalAnalysisMapper.class);
job.setReducerClass(TotalAnalysis.TotalAnalysisReducer.class);
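// Run the job synchronously; map success to exit code 0 and failure to 1.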
if (JobClient.runJob(job).isSuccessful())
  return 0;
else
  return 1;
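// For reference, assuming TotalAnalysis follows the usual Configured/Tool pattern, the job would
// typically be launched from main() roughly like:
//   int res = ToolRunner.run(new Configuration(), new TotalAnalysis(), args);
//   System.exit(res);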