LOG.info("fetching: " + url);
}
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
WebPage page = new WebPage();
ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
if(!protocolOutput.getStatus().isSuccess()) {
LOG.error("Fetch failed with protocol status: "
+ ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
+ ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
return (-1);
}
Content content = protocolOutput.getContent();
if (content == null) {
LOG.error("No content for " + url);
return (-1);
}
page.setBaseUrl(new org.apache.avro.util.Utf8(url));
page.setContent(ByteBuffer.wrap(content.getContent()));
if (force) {
content.setContentType(contentType);
} else {
contentType = content.getContentType();
}
if (contentType == null) {
LOG.error("Failed to determine content type!");
return (-1);
}
page.setContentType(new Utf8(contentType));
if (ParserJob.isTruncated(url, page)) {
LOG.warn("Content is truncated, parse may fail!");
}
Parse parse = new ParseUtil(conf).parse(url, page);
if (parse == null) {
LOG.error("Problem with parse - check log");
return (-1);
}
// Calculate the signature
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page);
if (LOG.isInfoEnabled()) {
LOG.info("parsing: " + url);
LOG.info("contentType: " + contentType);
LOG.info("signature: " + StringUtil.toHexString(signature));
}
LOG.info("---------\nUrl\n---------------\n");
System.out.print(url + "\n");
LOG.info("---------\nMetadata\n---------\n");
Map<Utf8, ByteBuffer> metadata = page.getMetadata();
StringBuffer sb = new StringBuffer();
if (metadata != null) {
Iterator<Entry<Utf8, ByteBuffer>> iterator = metadata.entrySet()
.iterator();
while (iterator.hasNext()) {