delete(outPath);
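// (delete() is assumed to be a small helper, defined elsewhere in the example, that removes
// the given path from the FileSystem if it already exists.)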

// We only need to execute a map-only job for this task.
// Every mapper will process an HTML file and extract the reviews from it.
MapOnlyJobBuilder builder = new MapOnlyJobBuilder(conf);
builder.addInput(new Path(inputFolder), new HadoopInputFormat(TextInputFormat.class),
    new MapOnlyMapper<LongWritable, Text, Text, BSONObject>() {
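
      // Note: the mapper buffers the entire input split in memory before parsing it, so this
      // approach assumes each crawled HTML file is small enough to fit in the mapper's heap.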
      StringBuffer inMemoryHtml = new StringBuffer();

      @Override
      protected void map(LongWritable key, Text value, Context context) throws IOException,
          InterruptedException {
        // For every line in the HTML, just append it to the string buffer.
        // We will process the entire HTML at the end, in cleanup().
        inMemoryHtml.append(value.toString());
      }

      @Override
      protected void cleanup(Context context, MultipleOutputsCollector coll) throws IOException, InterruptedException {
        String html = inMemoryHtml.toString();
        Matcher startMatcher = startPattern.matcher(html);
        Matcher endMatcher = endPattern.matcher(html);
        Text documentId = new Text();

        Matcher placeMatcher = placePattern.matcher(html);
        // We assume this will always match - otherwise, fail fast!
        placeMatcher.find();
        String placeId = placeMatcher.group(1);

        // We now proceed as follows:
        // We create a regex matcher for the start of reviews and another for the end of reviews.
        // Within each (start, end) pair, we execute an arbitrary number of matchers,
        // one for each of the other properties (username, date, rating, review text, ...).
        // Finally, we add all the properties to a Mongo BSONObject that can be emitted as output.
        while(startMatcher.find()) {
          BSONObject review = new BasicBSONObject();
          review.put("place_id", placeId);
          int reviewStart = startMatcher.start();
          endMatcher.find();
          int reviewEnd = endMatcher.start();
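          // (Note: this assumes every review-start marker in the page is followed by a matching
          // review-end marker, since the return value of endMatcher.find() is not checked.)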
          // Focus only on the (start, end) text for this review.
          String reviewText = html.substring(reviewStart, reviewEnd);
          for(Map.Entry<String, Pattern> parsingProperty : parsingConfig.entrySet()) {
            Matcher matcher = parsingProperty.getValue().matcher(reviewText);
            if(matcher.find()) {
              review.put(parsingProperty.getKey(), matcher.group(1).trim());
            }
          }
          // The Mongo documentId of the review will be its review_id.
          documentId.set((String) review.get("review_id"));
          // Write the pair (id, document) to the output collector.
          context.write(documentId, review);
        }
      }
    });

// --- This is the most important part (what makes it work with MongoDB): ---
// Set the URI of the MongoDB instance we will write to. Here we specify the database and the output collection.
MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/test.qype");
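// In the URI above, "test" is the database and "qype" is the collection the reviews will be written to.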
// Set the output format to HadoopOutputFormat(MongoOutputFormat.class).
// The key will be the documentId for the Mongo collection and the value a Mongo BSONObject with all the properties we wish.
builder.setOutput(new Path(outPath), new HadoopOutputFormat(MongoOutputFormat.class), Text.class,
    BSONObject.class);

// Finally, build and execute the Pangool Job.
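// (builder.cleanUpInstanceFiles() removes the temporary files that Pangool creates when it
// serializes the Mapper instance to ship it to the cluster.)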
try {
  builder.createJob().waitForCompletion(true);
} finally {
  builder.cleanUpInstanceFiles();
}

// We are not interested in the output folder, so delete it.
delete(outPath);
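
// For reference, the fields used above (startPattern, endPattern, placePattern and the
// parsingConfig map) would be declared elsewhere in the class. A minimal, hypothetical
// sketch - the actual regular expressions depend on the HTML being crawled - could look like:
//
//   Pattern startPattern = Pattern.compile("<div class=\"review\">");
//   Pattern endPattern   = Pattern.compile("<!-- /review -->");
//   Pattern placePattern = Pattern.compile("data-place-id=\"(\\d+)\"");
//
//   Map<String, Pattern> parsingConfig = new HashMap<String, Pattern>();
//   parsingConfig.put("review_id", Pattern.compile("data-review-id=\"(\\d+)\""));
//   parsingConfig.put("user_name", Pattern.compile("class=\"username\">([^<]+)<"));
//   parsingConfig.put("rating",    Pattern.compile("class=\"rating\">(\\d)<"));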