// Record the fetch outcome on the crawl datum: the supplied status code and
// the current wall-clock time (milliseconds) as the fetch time.
datum.setStatus(status);
datum.setFetchTime(System.currentTimeMillis());
// Keep the protocol status, when one was supplied, in the datum metadata.
if (pstatus != null)
datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
// Stays null when there is no content or when parsing throws below.
ParseResult parseResult = null;
// Everything below (scoring, parsing, signatures, output) runs only when
// content is present.
if (content != null) {
Metadata metadata = content.getMetadata();
// add segment to metadata
metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
// add score to content metadata so that ParseSegment can pick it up.
// A failure here is logged at warn level and processing continues.
try {
scfilters.passScoreBeforeParsing(key, datum, content);
}
catch (Exception e) {
if (LOG.isWarnEnabled()) {
e.printStackTrace(LogUtil.getWarnStream(LOG));
LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
}
}
try {
// parse the content
parseResult = this.parseUtil.parse(content);
}
catch (Exception e) {
// Best effort: a parser failure is logged and parseResult stays null,
// so the content is still written out below.
LOG.warn("Error parsing: " + key + ": "
+ StringUtils.stringifyException(e));
}
// set the content signature
// With no parse result, the signature is calculated from the content and
// an empty parse and stored on the datum.
if (parseResult == null) {
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
content, new ParseStatus().getEmptyParse(getConf()));
datum.setSignature(signature);
}
try {
// Emit the datum and the raw content under the original key.
// NOTE(review): when a parse result exists, datum.setSignature() is only
// called further down, after this collect; whether that signature reaches
// the emitted record depends on when the collector serializes - confirm.
output.collect(key, new NutchWritable(datum));
output.collect(key, new NutchWritable(content));
if (parseResult != null) {
// One iteration per (url, parse) entry of the parse result.
for (Entry <Text, Parse> entry : parseResult) {
Text url = entry.getKey();
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
// An unsuccessful parse is logged and replaced by the empty parse
// derived from its status.
if (!parseStatus.isSuccess()) {
LOG.warn("Error parsing: " + key + ": " + parseStatus);
parse = parseStatus.getEmptyParse(getConf());
}
// Calculate page signature.
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
content, parse);
// Ensure segment name and signature are in parseData content metadata
parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
segmentName);
parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
StringUtil.toHexString(signature));
// Pass fetch time to content meta
parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
Long.toString(datum.getFetchTime()));
// Only the entry whose url equals the original key updates the datum's
// signature.
if (url.equals(key))
datum.setSignature(signature);
// A scoring failure is logged at warn level; the parse is still emitted.
// NOTE(review): the warning below prints key, while the failing call
// was made with url - confirm which one is intended.
try {
scfilters.passScoreAfterParsing(url, content, parse);
}
catch (Exception e) {
if (LOG.isWarnEnabled()) {
e.printStackTrace(LogUtil.getWarnStream(LOG));
LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
}
}
// Emit a ParseImpl built from the parse text, parse data and canonical
// flag, keyed by the entry's url.
output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
parse.getText()), parse.getData(), parse.isCanonical())));
}
}
}
catch (IOException e) {
// An IOException from the collect calls is logged at error level and not
// rethrown.
if (LOG.isErrorEnabled()) {
LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
}
}
// return parse status if it exists
// The status returned is that of the parse stored under content.getUrl();
// when there is none, control falls through past the end of this block.
if (parseResult != null && !parseResult.isEmpty()) {
Parse p = parseResult.get(content.getUrl());
if (p != null) {
return p.getData().getStatus();
}
}
}