Package dmir.wikipedia

Examples of dmir.wikipedia.WikiLink


            //ignore links to special pages
            if (anchorTarget.length() > 0
                && !specialPattern.matcher(anchorTarget).matches()){

                WikiLink wl = new WikiLink();
                wl.setAnchorTarget( anchorTarget );
                links.add(wl);
            }
           
        } else if (disambPattern.matcher(rawText).find()) {
           
            isDisambig = true;
           
            matcher = linkPattern.matcher(rawText);
            while(matcher.find()) {

                String anchorTarget = matcher.group(2).trim().replace(' ', '_');
               
                //ignore links to special pages
                if (anchorTarget.length() == 0
                    || specialPattern.matcher(anchorTarget).matches()) continue;

                WikiLink wl = new WikiLink();
                wl.setAnchorTarget( anchorTarget );
                links.add(wl);
            }
        } else {
            //get infobox content
            this.infobox = getInfobox(rawText);
           
           
            //remove templates
            StringBuilder step0 = removeTemplates(rawText);
           
            //remove image and file links
            StringBuilder step05 = removeMediaLinks(step0);
           
            //remove html comments
            StringBuffer step1 = new StringBuffer();
            matcher = commentsPattern.matcher(step05);
            while(matcher.find()) {
                matcher.appendReplacement(step1, "");
            }
            matcher.appendTail(step1);
           
            //remove misc elements (eg, quotes)
            StringBuffer step2 = new StringBuffer();
            matcher = otherMarkupPattern.matcher(step1);
            while(matcher.find()) {
                matcher.appendReplacement(step2, "");
            }
            matcher.appendTail(step2);
           
            //replace html tags by whitespace
            StringBuffer step3 = new StringBuffer();
            matcher = tagsPattern.matcher(step2);
            while(matcher.find()) {
                matcher.appendReplacement(step3, " ");
            }
            matcher.appendTail(step3);
           
            //clean the text
            StringBuffer step35 = new StringBuffer();
            matcher = cleanPattern.matcher(step3);
            while(matcher.find()) {
                matcher.appendReplacement(step35, "");
            }
            matcher.appendTail(step35);
           
            // normalize whites
            StringBuffer step4 = new StringBuffer();
            matcher = whitesPattern.matcher(step35);
            while(matcher.find()) {
                matcher.appendReplacement(step4, " ");
            }
            matcher.appendTail(step4);

            //replace urls with a token
            StringBuffer step5 = new StringBuffer();
            matcher = urlPattern.matcher(step4);
            while(matcher.find()) {
                matcher.appendReplacement(step5, "URLTOKEN");
            }
            matcher.appendTail(step5);
           
            // Extract and Clean Links / Categories
            this.cleanText = new StringBuilder(step5);
            matcher = linkPattern.matcher(step5);
            int offset = 0;
            while(matcher.find()) {
                String anchorTarget = matcher.group(2).trim();
                if (anchorTarget.length() == 0) continue;
               
                String anchorText = matcher.group(4);
                if (anchorText == null) {
                    anchorText = anchorTarget;
                } else {
                    Matcher whitesMatcher = whitesPattern.matcher(anchorText);
                    StringBuffer sb = new StringBuffer();
                    while(whitesMatcher.find()) {
                        whitesMatcher.appendReplacement(sb, " ");
                    }
                    whitesMatcher.appendTail(sb);
                    anchorText = sb.toString().trim();
                }
               
                Matcher specialMatcher = null;
                if ((specialMatcher = categoryPattern.matcher(anchorTarget)).matches()) {
                    anchorText = specialMatcher.group(1);
                    categories.add(anchorText);
                } else if ((specialMatcher = specialPattern.matcher(anchorTarget)).matches()) {
                    anchorText = "";
                } else {
                    WikiLink wl = new WikiLink();
                    wl.setStart( matcher.start() - offset );
                    wl.setLength( anchorText.length() );
                    wl.setAnchorTarget( anchorTarget );
                    wl.setAnchorText( anchorText );

                    links.add(wl);
                }
               
                cleanText.delete(matcher.start() - offset, matcher.end() - offset);
View Full Code Here

TOP

Related Classes of dmir.wikipedia.WikiLink

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.