Package dmir.wikipedia.cleaners

Source Code of dmir.wikipedia.cleaners.AbstractWikiPageCleaner

package dmir.wikipedia.cleaners;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import dmir.wikipedia.WikiLink;
import dmir.wikipedia.addons.EnglishWikiAreaUtil;
import dmir.wikipedia.addons.EnglishWikiCoordinatesUtil;
import dmir.wikipedia.addons.EnglishWikiPopulationUtil;

public abstract class AbstractWikiPageCleaner {

  protected static final String WIKIPEDIA_LANGUAGE_PREFIXES =
          "|(aa:)|(ab:)|(af:)|(am:)|(an:)|(ar:)|(as:)|(av:)|(ay:)|(az:)" +
          "|(ba:)|(be:)|(bg:)|(bh:)|(bi:)|(bm:)|(bn:)|(bg:)|(bo:)|(br:)|(bs:)" +
          "|(ca:)|(ce:)|(ch:)|(co:)|(cr:)|(cs:)|(cv:)|(cy:)|(cu:)" +
          "|(da:)|(de:)|(dk:)|(dv:)|(dz:)" +
          "|(ee:)|(el:)|(eo:)|(es:)|(et:)|(eu:)" + // "|(en:)" +
          "|(fa:)|(ff:)|(fi:)|(fj:)|(fo:)|(fr:)|(fy:)" +
          "|(ga:)|(gd:)|(gl:)|(gn:)|(gu:)|(gv:)" +
          "|(ha:)|(he:)|(hi:)|(hr:)|(ht:)|(hu:)|(hy:)" +
          "|(ia:)|(id:)|(ie:)|(ig:)|(ii:)|(ik:)|(io:)|(is:)|(it:)|(iu:)" +
          "|(ja:)|(jv:)" +
          "|(ka:)|(kg:)|(ki:)|(kk:)|(kl:)|(km:)|(kn:)|(ko:)|(ks:)|(ku:)|(kv:)|(kw:)|(ky:)" +
          "|(la:)|(lb:)|(li:)|(ln:)|(lo:)|(lt:)|(lv:)" +
          "|(mg:)|(mh:)|(mi:)|(mk:)|(ml:)|(mn:)|(mo:)|(mr:)|(ms:)|(mt:)|(my:)" +
          "|(na:)|(nb:)|(ne:)|(ng:)|(nl:)|(nn:)|(no:)|(nv:)|(ny:)" +
          "|(oc:)|(or:)|(os:)" +
          "|(pa:)|(pl:)|(ps:)|(pt:)" +
          "|(qu:)" +
          "|(rm:)|(rn:)|(ro:)|(ru:)|(rw:)" +
          "|(sa:)|(sc:)|(sd:)|(se:)|(sg:)|(sh:)|(si:)|(sk:)|(sl:)|(sm:)|(sn:)|(so:)|(sq:)|(sr:)|(ss:)|(st:)|(su:)|(sv:)|(sw:)" +
          "|(ta:)|(te:)|(tg:)|(th:)|(ti:)|(tk:)|(tl:)|(tn:)|(to:)|(tr:)|(ts:)|(tt:)|(tw:)|(ty:)" +
          "|(ug:)|(uk:)|(ur:)|(uz:)" +
          "|(ve:)|(vi:)|(vo:)" +
          "|(wa:)|(wo:)" +
          "|(xh:)" +
          "|(yi:)|(yo:)" +
          "|(za:)|(zh:)|(zu:)" +
          "|(ace:)|(als:)|(ang:)|(arc:)|(ast:)|(arz:)" +
          "|(bar:)|(bat-smg:)|(bcl:)|(bjn:)|(bpy:)|(be-x-old:)|(bug:)" +
          "|(cdo:)|(ceb:)|(chr:)|(chy:)|(ckb:)|(crh:)|(csb:)" +
          "|(diq:)|(dsb:)" +
          "|(eml:)|(ext:)" +
          "|(fiu-vro:)|(frp:)|(frr:)|(fur:)" +
          "|(gan:)|(gag:)|(got:)" +
          "|(haw:)|(hif:)|(hak:)|(hsb:)" +
          "|(ilo:)" +
          "|(jbo:)" +
          "|(kaa:)|(kab:)|(kbd:)|(koi:)|(krc:)|(ksh:)" +
          "|(lad:)|(lbe:)|(lij:)|(lmo:)|(ltg:)" +
          "|(map-bms:)|(mdf:)|(mhr:)|(mrj:)|(mwl:)|(mzn:)" +
          "|(nds-nl:)" +
          "|(nah:)|(nap:)|(nds:)|(new:)|(nov:)|(nrm:)|(nso:)" +
          "|(pag:)|(pam:)|(pap:)|(pcd:)|(pdc:)|(pfl:)|(pih:)|(pms:)|(pnb:)|(pnt:)" +
          "|(roa-rup:)|(roa-tara:)|(rmy:)|(rue:)" +
          "|(scn:)|(sco:)|(sah:)|(srn:)|(stq:)|(szl:)" +
          "|(tet:)|(tpi:)|(tum:)" +
          "|(udm:)" +
          "|(vec:)|(vls:)" +
          "|(war:)|(wuu:)" +
          "|(xal:)|(xmf:)" +
          "|(zea:)|(zh-yue:)|(zh-min-nan:)|(zh-classical:)" +
          "|(simple:)";
 
    protected String title;
    protected StringBuilder rawText;
    protected String staticText;
    protected StringBuilder cleanText;
 
    protected boolean isRedirect = false;
    protected boolean isDisambig = false;
    protected List<WikiLink> links = new ArrayList<WikiLink>();
    protected Set<String> categories = new HashSet<String>();
    protected StringBuilder infobox;
    protected Double[] latLng;
   
    protected static final Pattern whitesPattern = Pattern.compile("(<br ?/?>)|(\\s+)",
        Pattern.DOTALL);
    protected static final Pattern tagsPattern = Pattern.compile("<[^<>]+>",
        Pattern.DOTALL);
    protected static final Pattern urlPattern = Pattern.compile("((http(s)?://)|(www.))[/~\\?%#=_:&;\\+\\-@\\^\\.\\w]+",
            Pattern.CASE_INSENSITIVE);
    protected static final Pattern commentsPattern = Pattern.compile("((<)|(&lt;))!--.*?--((&gt;)|(>))",
        Pattern.DOTALL);
    protected static final Pattern otherMarkupPattern = Pattern.compile("(\\{\\|.*?\\|\\})|(\\*\\*+)|(''+)|(==+)",
        Pattern.DOTALL);
    protected static final Pattern templatePattern = Pattern.compile("(\\{\\{)|(\\}\\})");
    protected static final Pattern linkPattern = Pattern.compile("\\[\\[(([^\\[\\]\\|]+)(\\|([^\\[\\]\\|]+))?)\\]\\]");
    protected static final Pattern cleanPattern = Pattern.compile("\\([ ,.;-]*\\)");
    protected static final Pattern infoboxStartPattern = Pattern.compile("\\{\\{(info|geo)box",
            Pattern.CASE_INSENSITIVE);
   
    protected final Pattern disambPattern;
    protected  final Pattern redirectPattern;
    protected final Pattern mediaLinksPattern;
    protected final Pattern categoryPattern;
    protected final Pattern specialPattern;
    protected final Pattern disambTitlePattern;
    protected final Pattern ignoreTitlePattern;


    public AbstractWikiPageCleaner(String title, StringBuilder rawText)
    {
      this.title = title;
      this.rawText = rawText;
        this.staticText = "";
        this.disambPattern = getDisambTemplatePattern();
        this.redirectPattern = getRedirectPattern();
        this.mediaLinksPattern = getMediaLinksPattern();
        this.categoryPattern = getCategoryPattern();
        this.specialPattern = getSpecialPattern();
        this.disambTitlePattern = getDisambTitlePattern();
        this.ignoreTitlePattern = getIgnoreTitlePattern();
        process();
    }
 
   
  protected abstract Pattern getDisambTemplatePattern();
    protected abstract Pattern getRedirectPattern();
    protected abstract Pattern getMediaLinksPattern();
    protected abstract Pattern getCategoryPattern();
    protected abstract Pattern getSpecialPattern();
    protected abstract Pattern getIgnoreTitlePattern();
    protected abstract Pattern getDisambTitlePattern();
   
    public abstract boolean isMediaLink(String text);
   
   
  /**
   *
   *
   *
   */
    private void process() {
       
        Matcher matcher = null;
        if((matcher = redirectPattern.matcher(rawText)).find()) {
           
            isRedirect = true;
            String anchorTarget = matcher.group(2).trim().replace(' ', '_');

            //ignore links to special pages
            if (anchorTarget.length() > 0
                && !specialPattern.matcher(anchorTarget).matches()){

                WikiLink wl = new WikiLink();
                wl.setAnchorTarget( anchorTarget );
                links.add(wl);
            }
           
        } else if (disambPattern.matcher(rawText).find()) {
           
            isDisambig = true;
           
            matcher = linkPattern.matcher(rawText);
            while(matcher.find()) {

                String anchorTarget = matcher.group(2).trim().replace(' ', '_');
               
                //ignore links to special pages
                if (anchorTarget.length() == 0
                    || specialPattern.matcher(anchorTarget).matches()) continue;

                WikiLink wl = new WikiLink();
                wl.setAnchorTarget( anchorTarget );
                links.add(wl);
            }
        } else {
            //get infobox content
            this.infobox = getInfobox(rawText);
           
           
            //remove templates
            StringBuilder step0 = removeTemplates(rawText);
           
            //remove image and file links
            StringBuilder step05 = removeMediaLinks(step0);
           
            //remove html comments
            StringBuffer step1 = new StringBuffer();
            matcher = commentsPattern.matcher(step05);
            while(matcher.find()) {
                matcher.appendReplacement(step1, "");
            }
            matcher.appendTail(step1);
           
            //remove misc elements (eg, quotes)
            StringBuffer step2 = new StringBuffer();
            matcher = otherMarkupPattern.matcher(step1);
            while(matcher.find()) {
                matcher.appendReplacement(step2, "");
            }
            matcher.appendTail(step2);
           
            //replace html tags by whitespace
            StringBuffer step3 = new StringBuffer();
            matcher = tagsPattern.matcher(step2);
            while(matcher.find()) {
                matcher.appendReplacement(step3, " ");
            }
            matcher.appendTail(step3);
           
            //clean the text
            StringBuffer step35 = new StringBuffer();
            matcher = cleanPattern.matcher(step3);
            while(matcher.find()) {
                matcher.appendReplacement(step35, "");
            }
            matcher.appendTail(step35);
           
            // normalize whites
            StringBuffer step4 = new StringBuffer();
            matcher = whitesPattern.matcher(step35);
            while(matcher.find()) {
                matcher.appendReplacement(step4, " ");
            }
            matcher.appendTail(step4);

            //replace urls with a token
            StringBuffer step5 = new StringBuffer();
            matcher = urlPattern.matcher(step4);
            while(matcher.find()) {
                matcher.appendReplacement(step5, "URLTOKEN");
            }
            matcher.appendTail(step5);
           
            // Extract and Clean Links / Categories
            this.cleanText = new StringBuilder(step5);
            matcher = linkPattern.matcher(step5);
            int offset = 0;
            while(matcher.find()) {
                String anchorTarget = matcher.group(2).trim();
                if (anchorTarget.length() == 0) continue;
               
                String anchorText = matcher.group(4);
                if (anchorText == null) {
                    anchorText = anchorTarget;
                } else {
                    Matcher whitesMatcher = whitesPattern.matcher(anchorText);
                    StringBuffer sb = new StringBuffer();
                    while(whitesMatcher.find()) {
                        whitesMatcher.appendReplacement(sb, " ");
                    }
                    whitesMatcher.appendTail(sb);
                    anchorText = sb.toString().trim();
                }
               
                Matcher specialMatcher = null;
                if ((specialMatcher = categoryPattern.matcher(anchorTarget)).matches()) {
                    anchorText = specialMatcher.group(1);
                    categories.add(anchorText);
                } else if ((specialMatcher = specialPattern.matcher(anchorTarget)).matches()) {
                    anchorText = "";
                } else {
                    WikiLink wl = new WikiLink();
                    wl.setStart( matcher.start() - offset );
                    wl.setLength( anchorText.length() );
                    wl.setAnchorTarget( anchorTarget );
                    wl.setAnchorText( anchorText );

                    links.add(wl);
                }
               
                cleanText.delete(matcher.start() - offset, matcher.end() - offset);
                cleanText.insert(matcher.start() - offset, anchorText);
               
                offset += matcher.group().length() - anchorText.length();
            }
            // WARNING: DO NOT MODIFY THE TEXT BEYOND THIS POINT OR THE OFFSETS WILL BE WRONG

            staticText = cleanText.toString();
        }
    }
   
   
   
    /**
     *
     * TODO Improve how unclosed templates are handled...
     *
     *
     * @param before
     * @return
     */
    private StringBuilder removeTemplates(CharSequence before) {
       
        StringBuilder after = new StringBuilder(before);
       
        Matcher tMatcher = templatePattern.matcher(before);
       
        int offset = 0;
        int start = -1;
        int end = -1;
        int openCount = 0;
        int closeCount = 0;
       
        while (tMatcher.find()) {
            if (tMatcher.group().equals("{{")) {
               
                if (openCount == 0) { start = tMatcher.start(); }
                openCount++;
               
            } else if (openCount > 0){
                closeCount++;
                if (openCount - closeCount == 0) {
                    end = tMatcher.end();
                    after.delete(start - offset, end - offset);
                    offset += (end - start);
                   
                    openCount = 0;
                    closeCount = 0;
                }
            }
        }
       
        return after;
    }
   
   
   
    /**
     *
     * TODO Improve how unclosed templates are handled...
     *
     *
     * @param before
     * @return
     */
    private StringBuilder removeMediaLinks(CharSequence before) {
       
        StringBuilder after = new StringBuilder(before);
       
        Matcher tMatcher = mediaLinksPattern.matcher(before);
       
        int offset = 0;
        int start = -1;
        int end = -1;
        int openCount = 0;
        int closeCount = 0;
      
        boolean inMediaLink = false;
       
        while (tMatcher.find()) {
            if (tMatcher.group().startsWith("[[")) {
               
              if (isMediaLink(tMatcher.group())) {
                inMediaLink = true;
                start = tMatcher.start();
              }
             
                if (inMediaLink) openCount++;
               
            } else if (openCount > 0){
                closeCount++;
                if (openCount - closeCount == 0) {
                    end = tMatcher.end();
                    after.delete(start - offset, end - offset);
                    offset += (end - start);
                   
                    openCount = 0;
                    closeCount = 0;
                    inMediaLink = false;
                }
            }
        }
       
        return after;
    }
   
   
    /**
     *
     *
     *
     * @param raw
     * @return
     */
    private StringBuilder getInfobox(CharSequence raw) {
       
        StringBuilder infobox = new StringBuilder();
       
        Matcher infoMatcher = infoboxStartPattern.matcher(raw);
       
        int openCount = 1;
        int closeCount = 0;
       
        if (infoMatcher.find()) {
           
            int iStart = infoMatcher.start();
            Matcher tMatcher = templatePattern.matcher(raw);

            while (tMatcher.find()) {

                int tStart = tMatcher.start();
                if (tStart <= iStart) continue;
               
                if (tMatcher.group().equals("{{")) {

                    openCount++;

                } else {
                    closeCount++;
                    if (openCount - closeCount == 0) {
                        infobox.append(raw, iStart, tMatcher.end());
                        return infobox;
                    }
                }
            }
        }
       
        return infobox;
    }
   
 
    public String getCleanText() {
        return staticText;
    }
   
    public boolean isRedirect() {
        return isRedirect;
    }
   
    public boolean isDisambiguation() {
        return isDisambig;
    }
   
    public List<WikiLink> getLinks() {
        return links;
    }
   
    public Set<String> getCategories() {
        return categories;
    }
   
    public String getInfobox() {
        return infobox != null ? infobox.toString() : "";
    }
   
    public Double[] getLatLng() {
        if (latLng == null) latLng = EnglishWikiCoordinatesUtil.INSTANCE.getCoordinates(getInfobox());
        return latLng;
    }
   
    public Long getPopulation() {
      return EnglishWikiPopulationUtil.INSTANCE.getPopulation(getInfobox());
    }
   
    public Float getArea() {
      return EnglishWikiAreaUtil.INSTANCE.getArea(getInfobox());
    }
   
    public boolean ignore() {
      return ignoreTitlePattern.matcher(title).find();
    }

}
TOP

Related Classes of dmir.wikipedia.cleaners.AbstractWikiPageCleaner

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.