//ignore links to special pages
if (anchorTarget.length() > 0
&& !specialPattern.matcher(anchorTarget).matches()){
WikiLink wl = new WikiLink();
wl.setAnchorTarget( anchorTarget );
links.add(wl);
}
} else if (disambPattern.matcher(rawText).find()) {
isDisambig = true;
matcher = linkPattern.matcher(rawText);
while(matcher.find()) {
String anchorTarget = matcher.group(2).trim().replace(' ', '_');
//ignore links to special pages
if (anchorTarget.length() == 0
|| specialPattern.matcher(anchorTarget).matches()) continue;
WikiLink wl = new WikiLink();
wl.setAnchorTarget( anchorTarget );
links.add(wl);
}
} else {
//get infobox content
this.infobox = getInfobox(rawText);
//remove templates
StringBuilder step0 = removeTemplates(rawText);
//remove image and file links
StringBuilder step05 = removeMediaLinks(step0);
//remove html comments
StringBuffer step1 = new StringBuffer();
matcher = commentsPattern.matcher(step05);
while(matcher.find()) {
matcher.appendReplacement(step1, "");
}
matcher.appendTail(step1);
//remove misc elements (eg, quotes)
StringBuffer step2 = new StringBuffer();
matcher = otherMarkupPattern.matcher(step1);
while(matcher.find()) {
matcher.appendReplacement(step2, "");
}
matcher.appendTail(step2);
//replace html tags by whitespace
StringBuffer step3 = new StringBuffer();
matcher = tagsPattern.matcher(step2);
while(matcher.find()) {
matcher.appendReplacement(step3, " ");
}
matcher.appendTail(step3);
//clean the text
StringBuffer step35 = new StringBuffer();
matcher = cleanPattern.matcher(step3);
while(matcher.find()) {
matcher.appendReplacement(step35, "");
}
matcher.appendTail(step35);
// normalize whitespace: replace every whitesPattern match with a single space
StringBuffer step4 = new StringBuffer();
matcher = whitesPattern.matcher(step35);
while(matcher.find()) {
matcher.appendReplacement(step4, " ");
}
matcher.appendTail(step4);
//replace urls with a token
StringBuffer step5 = new StringBuffer();
matcher = urlPattern.matcher(step4);
while(matcher.find()) {
matcher.appendReplacement(step5, "URLTOKEN");
}
matcher.appendTail(step5);
// Extract and Clean Links / Categories
this.cleanText = new StringBuilder(step5);
matcher = linkPattern.matcher(step5);
int offset = 0;
while(matcher.find()) {
String anchorTarget = matcher.group(2).trim();
if (anchorTarget.length() == 0) continue;
String anchorText = matcher.group(4);
if (anchorText == null) {
anchorText = anchorTarget;
} else {
Matcher whitesMatcher = whitesPattern.matcher(anchorText);
StringBuffer sb = new StringBuffer();
while(whitesMatcher.find()) {
whitesMatcher.appendReplacement(sb, " ");
}
whitesMatcher.appendTail(sb);
anchorText = sb.toString().trim();
}
Matcher specialMatcher = null;
if ((specialMatcher = categoryPattern.matcher(anchorTarget)).matches()) {
anchorText = specialMatcher.group(1);
categories.add(anchorText);
} else if ((specialMatcher = specialPattern.matcher(anchorTarget)).matches()) {
anchorText = "";
} else {
WikiLink wl = new WikiLink();
wl.setStart( matcher.start() - offset );
wl.setLength( anchorText.length() );
wl.setAnchorTarget( anchorTarget );
wl.setAnchorText( anchorText );
links.add(wl);
}
cleanText.delete(matcher.start() - offset, matcher.end() - offset);