/**
 * Converts one fetched Nutch record into a {@link BehemothDocument} and
 * emits it under the same key.
 *
 * <p>The fetch status is read from the content metadata entry
 * {@code Nutch.FETCH_STATUS_KEY}. A record is emitted only when that status
 * equals {@code CrawlDatum.STATUS_FETCH_SUCCESS}; records with any other
 * status, or with a missing or non-numeric status entry, are skipped and a
 * debug message is logged.
 *
 * @param key      the record key; its string form is stored as the document URL
 * @param content  the Nutch content supplying the fetch status metadata,
 *                 the content type and the raw bytes
 * @param output   collector receiving the {@code (key, document)} pair
 * @param reporter not used by this method
 * @throws IOException propagated from the output collector
 */
public void map(Text key, Content content,
        OutputCollector<Text, BehemothDocument> output, Reporter reporter)
        throws IOException {
    String rawStatus = content.getMetadata().get(Nutch.FETCH_STATUS_KEY);
    int status;
    try {
        // Integer.parseInt throws NumberFormatException for null as well as
        // for non-numeric text, so this covers a missing metadata entry too.
        status = Integer.parseInt(rawStatus);
    } catch (NumberFormatException e) {
        // One record with broken metadata must not fail the whole map task.
        LOG.debug("Skipping " + key
                + " as fetch status is missing or invalid: " + rawStatus);
        return;
    }
    if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
        // content not fetched successfully, skip document
        LOG.debug("Skipping " + key
                + " as content is not fetched successfully");
        return;
    }

    // TODO store the fetch metadata in the Behemoth document
    // store the binary content and mimetype in the Behemoth document
    // (the document is created only once we know it will be emitted)
    BehemothDocument behemothDocument = new BehemothDocument();
    behemothDocument.setUrl(key.toString());
    behemothDocument.setContent(content.getContent());
    behemothDocument.setContentType(content.getContentType());
    output.collect(key, behemothDocument);
}