reporter.incrCounter(LinkCounter.INVALID_URL, 1);
return;
}
arrayList.clear();
arrayList.add(new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, null, docno));
keyWord.set(base);
output.collect(keyWord, arrayList);
arrayList.clear();
// keeping track of the number of documents that have actually been
// processed
reporter.incrCounter(LinkCounter.OUTPUT_DOCS, 1);
try {
baseHost = new URI(base).getHost();
} catch (Exception e) {
reporter.incrCounter(LinkCounter.INVALID_URL, 1);
return;
}
if(baseHost == null) {
reporter.incrCounter(LinkCounter.INVALID_URL, 1);
return;
}
try {
// initializing the parser with new content
parser.setInputHTML(doc.getContent());
// Setting base URL for the current document
NodeList nl = parser.parse(null);
BaseHrefTag baseTag = new BaseHrefTag();
baseTag.setBaseUrl(base);
nl.add(baseTag);
// re-initializing the parser with the correct content
parser.setInputHTML(nl.toHtml());
// listing all LinkTag nodes
list = parser.extractAllNodesThatMatch(filter);
} catch (ParserException e) {
reporter.incrCounter(LinkCounter.PARSER_FAILED, 1);
return;
} catch (StackOverflowError e) {
reporter.incrCounter(LinkCounter.PARSER_FAILED, 1);
return;
}
for(int i = 0; i < list.size(); i++) {
LinkTag link = (LinkTag) list.elementAt(i);
String anchor = link.getLinkText();
String url = link.extractLink();
if(url == null) {
continue;
}
if(url.equals(base)) {// discard self links
continue;
}
String host = null;
try {
host = new URI(url).getHost();
} catch (Exception e) {
continue;
}
if(host == null) {
continue;
}
if(anchor == null) {
anchor = "";
}
// normalizing the anchor text
anchor = normalizer.process(anchor);
arrayList.clear();
if(baseHost.equals(host)) {
if(!includeInternalLinks) {
continue;
}
arrayList.add(new AnchorText(
AnchorTextConstants.Type.INTERNAL_IN_LINK.val, anchor, docno));
} else {
arrayList.add(new AnchorText(
AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, anchor, docno));
}
try {
keyWord.set(url);
output.collect(keyWord, arrayList);
} catch (UTFDataFormatException e) {
reporter.incrCounter(LinkCounter.TEXT_TOO_LONG, 1);
keyWord.set(url);
byte flag = arrayList.get(0).getType();
arrayList.clear();
arrayList.add(new AnchorText(flag, AnchorTextConstants.EMPTY_STRING, docno));
output.collect(keyWord, arrayList);
}
}
}