String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
if (sig != null) {
  byte[] signature = StringUtil.fromHexString(sig);
  if (signature != null) {
    // append a CrawlDatum with a signature
    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0.0f);
    d.setSignature(signature);
    crawlOut.append(key, d);
  }
}
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
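// when external links are to be ignored, remember the host of the source URL
// so that outlinks pointing at other hosts can be skipped further down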
if (ignoreExternalLinks) {
  try {
    fromHost = new URL(fromUrl).getHost().toLowerCase();
  } catch (MalformedURLException e) {
    fromHost = null;
  }
} else {
  fromHost = null;
}
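// first pass: normalize and filter every outlink; links that fail either step,
// or that point back to the page itself, are set to null and excluded from validCount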
String[] toUrls = new String[links.length];
int validCount = 0;
for (int i = 0; i < links.length; i++) {
  String toUrl = links[i].getToUrl();
  try {
    toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); // normalize the url
    toUrl = filters.filter(toUrl); // filter the url
  } catch (Exception e) {
    toUrl = null;
  }
  // ignore links to self (or anchors within the page)
  if (fromUrl.equals(toUrl)) toUrl = null;
  if (toUrl != null) validCount++;
  toUrls[i] = toUrl;
}
CrawlDatum adjust = null;
// compute score contributions and adjustment to the original score
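// second pass over the normalized, filtered outlinks; when ignoreExternalLinks
// is set, links whose host differs from fromHost are skipped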
for (int i = 0; i < toUrls.length; i++) {
  if (toUrls[i] == null) continue;
  if (ignoreExternalLinks) {
    try {
      toHost = new URL(toUrls[i]).getHost().toLowerCase();
    } catch (MalformedURLException e) {
      toHost = null;
    }
    if (toHost == null || !toHost.equals(fromHost)) { // external links
      continue; // skip it
    }
  }
  CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
  Text targetUrl = new Text(toUrls[i]);
  adjust = null;
  try {
    adjust = scfilters.distributeScoreToOutlink((Text)key, targetUrl,
        parseData, target, null, links.length, validCount);