package dmir.wikipedia.cleaners;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import dmir.wikipedia.WikiLink;
import dmir.wikipedia.addons.EnglishWikiAreaUtil;
import dmir.wikipedia.addons.EnglishWikiCoordinatesUtil;
import dmir.wikipedia.addons.EnglishWikiPopulationUtil;
public abstract class AbstractWikiPageCleaner {
/**
 * Regex alternation fragment enumerating Wikipedia language-edition prefixes
 * (two-letter codes first, then longer codes such as {@code zh-yue:} and {@code simple:}).
 *
 * <p>The fragment starts with {@code |}, so compiled on its own it would begin with an
 * empty alternative; presumably subclasses append it to another alternation
 * (TODO confirm against usages).
 *
 * <p>Every prefix sits in its own capturing group. {@code (bg:)} is listed twice and
 * {@code (en:)} is deliberately commented out. NOTE(review): dropping the duplicate
 * would renumber all following groups, so check for index-based group access first.
 */
protected static final String WIKIPEDIA_LANGUAGE_PREFIXES =
"|(aa:)|(ab:)|(af:)|(am:)|(an:)|(ar:)|(as:)|(av:)|(ay:)|(az:)" +
"|(ba:)|(be:)|(bg:)|(bh:)|(bi:)|(bm:)|(bn:)|(bg:)|(bo:)|(br:)|(bs:)" +
"|(ca:)|(ce:)|(ch:)|(co:)|(cr:)|(cs:)|(cv:)|(cy:)|(cu:)" +
"|(da:)|(de:)|(dk:)|(dv:)|(dz:)" +
"|(ee:)|(el:)|(eo:)|(es:)|(et:)|(eu:)" + // "|(en:)" +
"|(fa:)|(ff:)|(fi:)|(fj:)|(fo:)|(fr:)|(fy:)" +
"|(ga:)|(gd:)|(gl:)|(gn:)|(gu:)|(gv:)" +
"|(ha:)|(he:)|(hi:)|(hr:)|(ht:)|(hu:)|(hy:)" +
"|(ia:)|(id:)|(ie:)|(ig:)|(ii:)|(ik:)|(io:)|(is:)|(it:)|(iu:)" +
"|(ja:)|(jv:)" +
"|(ka:)|(kg:)|(ki:)|(kk:)|(kl:)|(km:)|(kn:)|(ko:)|(ks:)|(ku:)|(kv:)|(kw:)|(ky:)" +
"|(la:)|(lb:)|(li:)|(ln:)|(lo:)|(lt:)|(lv:)" +
"|(mg:)|(mh:)|(mi:)|(mk:)|(ml:)|(mn:)|(mo:)|(mr:)|(ms:)|(mt:)|(my:)" +
"|(na:)|(nb:)|(ne:)|(ng:)|(nl:)|(nn:)|(no:)|(nv:)|(ny:)" +
"|(oc:)|(or:)|(os:)" +
"|(pa:)|(pl:)|(ps:)|(pt:)" +
"|(qu:)" +
"|(rm:)|(rn:)|(ro:)|(ru:)|(rw:)" +
"|(sa:)|(sc:)|(sd:)|(se:)|(sg:)|(sh:)|(si:)|(sk:)|(sl:)|(sm:)|(sn:)|(so:)|(sq:)|(sr:)|(ss:)|(st:)|(su:)|(sv:)|(sw:)" +
"|(ta:)|(te:)|(tg:)|(th:)|(ti:)|(tk:)|(tl:)|(tn:)|(to:)|(tr:)|(ts:)|(tt:)|(tw:)|(ty:)" +
"|(ug:)|(uk:)|(ur:)|(uz:)" +
"|(ve:)|(vi:)|(vo:)" +
"|(wa:)|(wo:)" +
"|(xh:)" +
"|(yi:)|(yo:)" +
"|(za:)|(zh:)|(zu:)" +
"|(ace:)|(als:)|(ang:)|(arc:)|(ast:)|(arz:)" +
"|(bar:)|(bat-smg:)|(bcl:)|(bjn:)|(bpy:)|(be-x-old:)|(bug:)" +
"|(cdo:)|(ceb:)|(chr:)|(chy:)|(ckb:)|(crh:)|(csb:)" +
"|(diq:)|(dsb:)" +
"|(eml:)|(ext:)" +
"|(fiu-vro:)|(frp:)|(frr:)|(fur:)" +
"|(gan:)|(gag:)|(got:)" +
"|(haw:)|(hif:)|(hak:)|(hsb:)" +
"|(ilo:)" +
"|(jbo:)" +
"|(kaa:)|(kab:)|(kbd:)|(koi:)|(krc:)|(ksh:)" +
"|(lad:)|(lbe:)|(lij:)|(lmo:)|(ltg:)" +
"|(map-bms:)|(mdf:)|(mhr:)|(mrj:)|(mwl:)|(mzn:)" +
"|(nds-nl:)" +
"|(nah:)|(nap:)|(nds:)|(new:)|(nov:)|(nrm:)|(nso:)" +
"|(pag:)|(pam:)|(pap:)|(pcd:)|(pdc:)|(pfl:)|(pih:)|(pms:)|(pnb:)|(pnt:)" +
"|(roa-rup:)|(roa-tara:)|(rmy:)|(rue:)" +
"|(scn:)|(sco:)|(sah:)|(srn:)|(stq:)|(szl:)" +
"|(tet:)|(tpi:)|(tum:)" +
"|(udm:)" +
"|(vec:)|(vls:)" +
"|(war:)|(wuu:)" +
"|(xal:)|(xmf:)" +
"|(zea:)|(zh-yue:)|(zh-min-nan:)|(zh-classical:)" +
"|(simple:)";
// Page title as passed to the constructor; matched against ignoreTitlePattern in ignore().
protected String title;
// Raw page text as passed to the constructor; the constructor does not copy it.
protected StringBuilder rawText;
// Final clean text snapshot returned by getCleanText(); stays "" for redirect and disambiguation pages.
protected String staticText;
// Working buffer for the clean text; only assigned in the regular-article branch of process().
protected StringBuilder cleanText;
// Set by process() when redirectPattern is found in the raw text.
protected boolean isRedirect = false;
// Set by process() when the page is not a redirect and disambPattern is found in the raw text.
protected boolean isDisambig = false;
// Links collected by process(); start/length are only set for links found in regular articles.
protected List<WikiLink> links = new ArrayList<WikiLink>();
// Category names, i.e. group(1) of categoryPattern for each matching link target.
protected Set<String> categories = new HashSet<String>();
// Text of the first infobox/geobox template; only assigned in the regular-article branch of process().
protected StringBuilder infobox;
// Cached result of getLatLng(); filled on first call.
protected Double[] latLng;
// Whitespace normalisation: <br>, <br/>, <br /> and any run of whitespace.
protected static final Pattern whitesPattern = Pattern.compile("(<br ?/?>)|(\\s+)",
        Pattern.DOTALL);
// Any remaining angle-bracket tag.
protected static final Pattern tagsPattern = Pattern.compile("<[^<>]+>",
        Pattern.DOTALL);
// URLs starting with http://, https:// or "www." (case-insensitive).
// The dot after "www" is escaped so that it only matches a literal '.'; unescaped it
// matched any character, which turned inputs such as "www foo" into a URL token.
protected static final Pattern urlPattern = Pattern.compile("((http(s)?://)|(www\\.))[/~\\?%#=_:&;\\+\\-@\\^\\.\\w]+",
        Pattern.CASE_INSENSITIVE);
// HTML comments, possibly spanning several lines.
// NOTE(review): both branches of each alternation are identical ('<' / '>'); one of them
// may originally have been the entity-escaped form - confirm against the input format.
protected static final Pattern commentsPattern = Pattern.compile("((<)|(<))!--.*?--((>)|(>))",
        Pattern.DOTALL);
// Wiki tables ({| ... |}), runs of two or more '*', runs of two or more quotes, and heading markers (==).
protected static final Pattern otherMarkupPattern = Pattern.compile("(\\{\\|.*?\\|\\})|(\\*\\*+)|(''+)|(==+)",
        Pattern.DOTALL);
// Template delimiters: an opening "{{" or a closing "}}".
protected static final Pattern templatePattern = Pattern.compile("(\\{\\{)|(\\}\\})");
// Wiki links [[target]] or [[target|text]]: group(2) is the target, group(4) the optional text.
protected static final Pattern linkPattern = Pattern.compile("\\[\\[(([^\\[\\]\\|]+)(\\|([^\\[\\]\\|]+))?)\\]\\]");
// Parentheses left holding only spaces and punctuation after other markup was removed.
protected static final Pattern cleanPattern = Pattern.compile("\\([ ,.;-]*\\)");
// Start of an infobox or geobox template (case-insensitive).
protected static final Pattern infoboxStartPattern = Pattern.compile("\\{\\{(info|geo)box",
        Pattern.CASE_INSENSITIVE);
// Language-specific patterns; each is obtained once in the constructor from the
// corresponding abstract getter implemented by the subclass.
// Searched in the raw text to flag disambiguation pages.
protected final Pattern disambPattern;
// Searched in the raw text to flag redirects; group(2) is read as the redirect target.
protected final Pattern redirectPattern;
// Tokeniser used by removeMediaLinks(); matches starting with "[[" are treated as openers.
protected final Pattern mediaLinksPattern;
// Matched against a whole link target; group(1) is stored as the category name.
protected final Pattern categoryPattern;
// Matched against a whole link target; matching links are dropped.
protected final Pattern specialPattern;
// Stored here but not used by any code in this class.
protected final Pattern disambTitlePattern;
// Searched in the page title by ignore().
protected final Pattern ignoreTitlePattern;
/**
 * Stores the page data, resolves the language-specific patterns from the subclass and
 * immediately processes the page.
 *
 * <p>NOTE(review): the abstract pattern getters (and, through {@code process()},
 * {@link #isMediaLink(String)}) are invoked from this constructor, i.e. before any
 * subclass constructor body or subclass field initialiser has run. Implementations
 * must therefore not rely on subclass instance state.
 *
 * @param title   page title
 * @param rawText raw page text; kept by reference, not copied
 */
public AbstractWikiPageCleaner(String title, StringBuilder rawText)
{
this.title = title;
this.rawText = rawText;
this.staticText = "";
// Resolve all subclass-provided patterns before processing.
this.disambPattern = getDisambTemplatePattern();
this.redirectPattern = getRedirectPattern();
this.mediaLinksPattern = getMediaLinksPattern();
this.categoryPattern = getCategoryPattern();
this.specialPattern = getSpecialPattern();
this.disambTitlePattern = getDisambTitlePattern();
this.ignoreTitlePattern = getIgnoreTitlePattern();
// Must stay last: process() uses every field assigned above.
process();
}
/** @return pattern searched with {@code find()} in the raw text to detect a disambiguation page */
protected abstract Pattern getDisambTemplatePattern();
/** @return pattern searched with {@code find()} in the raw text to detect a redirect; group 2 must hold the target */
protected abstract Pattern getRedirectPattern();
/**
 * @return pattern used to tokenise link delimiters when removing media links; a match
 *         starting with {@code "[["} is treated as an opener, any other match as a closer
 */
protected abstract Pattern getMediaLinksPattern();
/** @return pattern that must match a whole link target for a category link; group 1 is used as the category name */
protected abstract Pattern getCategoryPattern();
/** @return pattern that must match a whole link target for links that are to be dropped */
protected abstract Pattern getSpecialPattern();
/** @return pattern searched with {@code find()} in the page title by {@link #ignore()} */
protected abstract Pattern getIgnoreTitlePattern();
/** @return pattern stored in {@code disambTitlePattern}; not used by this class itself */
protected abstract Pattern getDisambTitlePattern();
/**
 * Decides whether a link opener belongs to a media (image/file) link.
 *
 * @param text the text matched by the media-links pattern for an opener (starts with {@code "[["})
 * @return {@code true} if the link should be removed as a media link
 */
public abstract boolean isMediaLink(String text);
/**
 * Classifies the page and fills the derived fields.
 *
 * <ul>
 * <li>Redirect: sets {@code isRedirect} and records the redirect target as the only link.</li>
 * <li>Disambiguation page: sets {@code isDisambig} and records every link target.</li>
 * <li>Regular article: extracts the infobox, strips templates, media links, comments,
 *     markup and tags, normalises whitespace, replaces URLs with {@code URLTOKEN}, then
 *     replaces every wiki link by its display text while recording links (with offsets
 *     into the final text) and categories.</li>
 * </ul>
 *
 * For redirects and disambiguation pages targets have spaces replaced by underscores and
 * the clean text stays empty. NOTE(review): targets of regular-article links are only
 * trimmed, not underscore-normalised - confirm this asymmetry is intended.
 */
private void process() {
    Matcher redirectMatcher = redirectPattern.matcher(rawText);
    if (redirectMatcher.find()) {
        isRedirect = true;
        String target = redirectMatcher.group(2).trim().replace(' ', '_');
        // links to special pages are ignored
        if (target.length() > 0 && !specialPattern.matcher(target).matches()) {
            WikiLink redirectLink = new WikiLink();
            redirectLink.setAnchorTarget(target);
            links.add(redirectLink);
        }
        return;
    }

    if (disambPattern.matcher(rawText).find()) {
        isDisambig = true;
        Matcher rawLinks = linkPattern.matcher(rawText);
        while (rawLinks.find()) {
            String target = rawLinks.group(2).trim().replace(' ', '_');
            // links to special pages are ignored
            if (target.length() > 0 && !specialPattern.matcher(target).matches()) {
                WikiLink listedLink = new WikiLink();
                listedLink.setAnchorTarget(target);
                links.add(listedLink);
            }
        }
        return;
    }

    // Regular article. The infobox is taken from the untouched raw text first.
    this.infobox = getInfobox(rawText);

    // Structural removals: templates, then image/file links.
    CharSequence withoutMedia = removeMediaLinks(removeTemplates(rawText));

    // Regex clean-up pipeline; the order of the steps matters.
    String text = commentsPattern.matcher(withoutMedia).replaceAll("");   // html comments
    text = otherMarkupPattern.matcher(text).replaceAll("");              // tables, quotes, headings
    text = tagsPattern.matcher(text).replaceAll(" ");                    // html tags -> blank
    text = cleanPattern.matcher(text).replaceAll("");                    // emptied parentheses
    text = whitesPattern.matcher(text).replaceAll(" ");                  // normalise whites
    text = urlPattern.matcher(text).replaceAll("URLTOKEN");              // urls -> token

    // Replace links by their display text, collecting links and categories on the way.
    this.cleanText = new StringBuilder(text);
    Matcher linkMatcher = linkPattern.matcher(text);
    int shrink = 0; // how much shorter cleanText is than 'text' up to the current match
    while (linkMatcher.find()) {
        String target = linkMatcher.group(2).trim();
        if (target.length() == 0) {
            continue;
        }

        String label = linkMatcher.group(4);
        if (label != null) {
            label = whitesPattern.matcher(label).replaceAll(" ").trim();
        } else {
            label = target;
        }

        int from = linkMatcher.start() - shrink;
        int to = linkMatcher.end() - shrink;

        Matcher categoryMatcher = categoryPattern.matcher(target);
        if (categoryMatcher.matches()) {
            // category links are replaced by the bare category name
            label = categoryMatcher.group(1);
            categories.add(label);
        } else if (specialPattern.matcher(target).matches()) {
            // special links vanish from the text
            label = "";
        } else {
            WikiLink link = new WikiLink();
            link.setStart(from);
            link.setLength(label.length());
            link.setAnchorTarget(target);
            link.setAnchorText(label);
            links.add(link);
        }

        cleanText.delete(from, to);
        cleanText.insert(from, label);
        shrink += linkMatcher.group().length() - label.length();
    }

    // WARNING: DO NOT MODIFY THE TEXT BEYOND THIS POINT OR THE OFFSETS WILL BE WRONG
    staticText = cleanText.toString();
}
/**
 * Returns a copy of the input with every balanced top-level {@code {{ ... }}} template
 * (including anything nested inside it) removed.
 *
 * <p>A {@code }}} seen while no template is open is ignored. A template that is never
 * closed is left in the text, and so is everything after its opening braces.
 *
 * TODO Improve how unclosed templates are handled...
 *
 * @param before text to strip; not modified
 * @return a new builder holding the text without balanced templates
 */
private StringBuilder removeTemplates(CharSequence before) {
    StringBuilder after = new StringBuilder(before);
    Matcher braces = templatePattern.matcher(before);
    int removed = 0;      // characters already deleted from 'after'
    int outerStart = -1;  // position in 'before' of the currently open top-level "{{"
    int depth = 0;        // current template nesting level
    while (braces.find()) {
        if ("{{".equals(braces.group())) {
            if (depth == 0) {
                outerStart = braces.start();
            }
            depth++;
            continue;
        }
        if (depth == 0) {
            // closing braces without a matching opener
            continue;
        }
        depth--;
        if (depth == 0) {
            int outerEnd = braces.end();
            after.delete(outerStart - removed, outerEnd - removed);
            removed += outerEnd - outerStart;
        }
    }
    return after;
}
/**
 * Returns a copy of the input with every balanced media link removed, together with any
 * links nested inside it (e.g. links in an image caption).
 *
 * <p>The text is tokenised with {@code mediaLinksPattern}: a match starting with
 * {@code "[["} is an opener, any other match is a closer. A media link begins at an
 * opener accepted by {@link #isMediaLink(String)} while no media link is open; every
 * further opener only deepens the nesting until the matching closer is reached.
 * Closers seen outside a media link are ignored.
 *
 * <p>The start position is recorded only for the outermost media link. Previously a
 * media link nested in another media link overwrote the start, so only the inner tail
 * was deleted and the outer {@code [[...} prefix was left in the text.
 *
 * TODO Improve how unclosed media links are handled (they are currently left in place).
 *
 * @param before text to strip; not modified
 * @return a new builder holding the text without balanced media links
 */
private StringBuilder removeMediaLinks(CharSequence before) {
    StringBuilder after = new StringBuilder(before);
    Matcher tMatcher = mediaLinksPattern.matcher(before);
    int offset = 0;  // characters already deleted from 'after'
    int start = -1;  // position in 'before' of the outermost open media link
    int depth = 0;   // link nesting level inside the current media link (0 = none open)
    while (tMatcher.find()) {
        String token = tMatcher.group();
        if (token.startsWith("[[")) {
            if (depth == 0) {
                if (!isMediaLink(token)) {
                    // ordinary link outside any media link: keep it
                    continue;
                }
                start = tMatcher.start();
            }
            depth++;
        } else if (depth > 0) {
            depth--;
            if (depth == 0) {
                int end = tMatcher.end();
                after.delete(start - offset, end - offset);
                offset += (end - start);
            }
        }
    }
    return after;
}
/**
 * Extracts the first infobox/geobox template from the raw text.
 *
 * <p>The result spans from the opening braces matched by {@code infoboxStartPattern}
 * up to and including the closing braces that balance them, nested templates included.
 *
 * @param raw raw page text
 * @return the infobox text, or an empty builder when there is no infobox or it is never closed
 */
private StringBuilder getInfobox(CharSequence raw) {
    StringBuilder result = new StringBuilder();
    Matcher startMatcher = infoboxStartPattern.matcher(raw);
    if (!startMatcher.find()) {
        return result;
    }
    int boxStart = startMatcher.start();
    int depth = 1; // accounts for the infobox's own opening braces
    Matcher braces = templatePattern.matcher(raw);
    while (braces.find()) {
        if (braces.start() <= boxStart) {
            // delimiter before (or at) the infobox start
            continue;
        }
        if ("{{".equals(braces.group())) {
            depth++;
            continue;
        }
        depth--;
        if (depth == 0) {
            result.append(raw, boxStart, braces.end());
            break;
        }
    }
    return result;
}
/**
 * @return the cleaned page text; the empty string for redirect and disambiguation pages,
 *         for which no clean text is produced
 */
public String getCleanText() {
return staticText;
}
/** @return {@code true} if the redirect pattern was found in the raw text */
public boolean isRedirect() {
return isRedirect;
}
/** @return {@code true} if the page is not a redirect and the disambiguation pattern was found in the raw text */
public boolean isDisambiguation() {
return isDisambig;
}
/** @return the internal (mutable, not copied) list of links collected while processing the page */
public List<WikiLink> getLinks() {
return links;
}
/** @return the internal (mutable, not copied) set of category names collected while processing the page */
public Set<String> getCategories() {
return categories;
}
/** @return the extracted infobox text, or the empty string when no infobox was extracted */
public String getInfobox() {
    if (infobox == null) {
        return "";
    }
    return infobox.toString();
}
/**
 * Returns the coordinates parsed from the infobox by {@code EnglishWikiCoordinatesUtil}.
 * A non-null result is cached; while the parser returns {@code null} it is queried again
 * on every call.
 *
 * @return whatever the coordinates utility returns for the infobox text
 */
public Double[] getLatLng() {
    if (latLng != null) {
        return latLng;
    }
    latLng = EnglishWikiCoordinatesUtil.INSTANCE.getCoordinates(getInfobox());
    return latLng;
}
/** @return the population parsed from the infobox text by {@code EnglishWikiPopulationUtil}; not cached */
public Long getPopulation() {
    String infoboxText = getInfobox();
    return EnglishWikiPopulationUtil.INSTANCE.getPopulation(infoboxText);
}
/** @return the area parsed from the infobox text by {@code EnglishWikiAreaUtil}; not cached */
public Float getArea() {
    String infoboxText = getInfobox();
    return EnglishWikiAreaUtil.INSTANCE.getArea(infoboxText);
}
/** @return {@code true} if the ignore-title pattern is found anywhere in the page title */
public boolean ignore() {
    Matcher titleMatcher = ignoreTitlePattern.matcher(title);
    return titleMatcher.find();
}
}