Package pignlproc.markup

Source Code of pignlproc.markup.AnnotatingMarkupParser$CountingAppendable

package pignlproc.markup;

import info.bliki.htmlcleaner.ContentToken;
import info.bliki.htmlcleaner.TagNode;
import info.bliki.wiki.filter.ITextConverter;
import info.bliki.wiki.filter.WPList;
import info.bliki.wiki.filter.WPTable;
import info.bliki.wiki.model.Configuration;
import info.bliki.wiki.model.IWikiModel;
import info.bliki.wiki.model.ImageFormat;
import info.bliki.wiki.model.WikiModel;
import info.bliki.wiki.tags.WPATag;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Parse mediawiki markup to strip the formatting info and extract a simple text
* version suitable for NLP along with header, paragraph and link position
* annotations.
*
* Use the {@code #convert(String)} and {@code #getWikiLinks()} methods.
*
* Due to the constraints imposed by the {@code ITextConverter} /
* {@code WikiModel} API, this class is not thread safe: only one instance
* should be run by thread.
*/
public class AnnotatingMarkupParser implements ITextConverter {

    public static final String HREF_ATTR_KEY = "href";

    public static final String WIKILINK_TITLE_ATTR_KEY = "title";

    public static final String WIKILINK_TARGET_ATTR_KEY = "href";

    public static final String WIKIOBJECT_ATTR_KEY = "wikiobject";

    public static final Set<String> PARAGRAPH_TAGS = new HashSet<String>(
            Arrays.asList("p"));

    public static final Set<String> HEADING_TAGS = new HashSet<String>(
            Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));

    public static final Pattern INTERWIKI_PATTERN = Pattern.compile("http://[\\w-]+\\.wikipedia\\.org/wiki/.*");

    protected final List<Annotation> wikilinks = new ArrayList<Annotation>();

    protected final List<Annotation> headers = new ArrayList<Annotation>();

    protected final List<Annotation> paragraphs = new ArrayList<Annotation>();

    protected String languageCode = "en";

    protected final WikiModel model;

    protected String redirect;

    protected String text;

    protected static final Pattern REDIRECT_PATTERN = Pattern.compile("^#REDIRECT \\[\\[([^\\]]*)\\]\\]");

    public AnnotatingMarkupParser() {
        model = makeWikiModel(languageCode);
    }

    public AnnotatingMarkupParser(String languageCode) {
        this.languageCode = languageCode;
        model = makeWikiModel(languageCode);
    }

    public WikiModel makeWikiModel(String languageCode) {
        return new WikiModel(String.format(
                "http://%s.wikipedia.org/wiki/${image}", languageCode),
                String.format("http://%s.wikipedia.org/wiki/${title}",
                        languageCode)) {
            @Override
            public String getRawWikiContent(String namespace,
                    String articleName, Map<String, String> templateParameters) {
                // disable template support
                // TODO: we need to readd template support at least for dates
                return "";
            }
        };
    }

    /**
     * Convert WikiMarkup to a simple text representation suitable for NLP
     * analysis. The links encountered during the extraction are then available
     * by calling {@code #getWikiLinks()}.
     *
     * @param rawWikiMarkup
     * @return the simple text without the markup
     */
    public String parse(String rawWikiMarkup) {
        Matcher matcher = REDIRECT_PATTERN.matcher(rawWikiMarkup);
        if (matcher.find()) {
            redirect = titleToUri(matcher.group(1), languageCode);
        } else {
            redirect = null;
        }
        wikilinks.clear();
        headers.clear();
        paragraphs.clear();
        text = model.render(this, rawWikiMarkup);
        return text;
    }

    public static String titleToUri(String title, String languageCode) {
        try {
            return String.format("http://%s.wikipedia.org/wiki/%s",
                    languageCode,
                    URLEncoder.encode(title.replaceAll(" ", "_"), "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    public void nodesToText(List<? extends Object> nodes, Appendable buffer,
            IWikiModel model) throws IOException {
        CountingAppendable countingBuffer;
        if (buffer instanceof CountingAppendable) {
            countingBuffer = (CountingAppendable) buffer;
        } else {
            // wrap
            countingBuffer = new CountingAppendable(buffer);
        }

        if (nodes != null && !nodes.isEmpty()) {
            try {
                int level = model.incrementRecursionLevel();
                if (level > Configuration.RENDERER_RECURSION_LIMIT) {
                    countingBuffer.append("Error - recursion limit exceeded"
                            + " rendering tags in PlainTextConverter#nodesToText().");
                    return;
                }
                for (Object node : nodes) {
                    if (node instanceof WPATag) {
                        // extract wikilink annotations
                        WPATag tag = (WPATag) node;
                        String wikilinkLabel = (String) tag.getAttributes().get(
                                WIKILINK_TITLE_ATTR_KEY);
                        String wikilinkTarget = (String) tag.getAttributes().get(
                                WIKILINK_TARGET_ATTR_KEY);
                        if (wikilinkLabel != null) {
                            int colonIdx = wikilinkLabel.indexOf(':');
                            if (colonIdx == -1) {
                                // do not serialize non-topic wiki-links such as
                                // translation links missing from the
                                // INTERWIKI_LINK map
                                int start = countingBuffer.currentPosition;
                                tag.getBodyString(countingBuffer);
                                int end = countingBuffer.currentPosition;
                                if (!wikilinkTarget.startsWith("#")) {
                                    wikilinks.add(new Annotation(start, end, wikilinkLabel, wikilinkTarget));
                                }
                            }
                        } else {
                            tag.getBodyString(countingBuffer);
                        }

                    } else if (node instanceof ContentToken) {
                        ContentToken contentToken = (ContentToken) node;
                        countingBuffer.append(contentToken.getContent());
                    } else if (node instanceof List) {
                    } else if (node instanceof WPList) {
                    } else if (node instanceof WPTable) {
                        // ignore lists and tables since they most of the time
                        // do not hold grammatically correct
                        // interesting sentences that are representative of the
                        // language.
                    } else if (node instanceof TagNode) {
                        TagNode tagNode = (TagNode) node;
                        Map<String, String> attributes = tagNode.getAttributes();
                        Map<String, Object> oAttributes = tagNode.getObjectAttributes();
                        boolean hasSpecialHandling = false;
                        String tagName = tagNode.getName();
                        int tagBegin = countingBuffer.currentPosition;
                        if ("a".equals(tagName)) {
                            String href = attributes.get(HREF_ATTR_KEY);
                            if (href != null
                                    && INTERWIKI_PATTERN.matcher(href).matches()) {
                                // ignore the interwiki links since they are
                                // mostly used for translation purpose.
                                hasSpecialHandling = true;
                            }
                        } else if ("ref".equals(tagName)) {
                            // ignore the references since they do not hold
                            // interesting text content
                            hasSpecialHandling = true;
                        } else if (oAttributes != null
                                && oAttributes.get(WIKIOBJECT_ATTR_KEY) instanceof ImageFormat) {
                            // the caption of images often holds well formed
                            // sentences with links to entities
                            hasSpecialHandling = true;
                            ImageFormat iformat = (ImageFormat) oAttributes.get(WIKIOBJECT_ATTR_KEY);
                            imageNodeToText(tagNode, iformat, countingBuffer,
                                    model);
                        }
                        if (!hasSpecialHandling) {
                            nodesToText(tagNode.getChildren(), countingBuffer,
                                    model);
                        }
                        if (PARAGRAPH_TAGS.contains(tagName)) {
                            paragraphs.add(new Annotation(tagBegin,
                                    countingBuffer.currentPosition,
                                    "paragraph", tagName));
                            countingBuffer.append("\n");
                        } else if (HEADING_TAGS.contains(tagName)) {
                            headers.add(new Annotation(tagBegin,
                                    countingBuffer.currentPosition, "heading",
                                    tagName));
                            countingBuffer.append("\n\n");
                        }
                    }
                }
            } finally {
                model.decrementRecursionLevel();
            }
        }
    }

    public void imageNodeToText(TagNode tagNode, ImageFormat imageFormat,
            Appendable buffer, IWikiModel model) throws IOException {
        nodesToText(tagNode.getChildren(), buffer, model);
    }

    public boolean noLinks() {
        return true;
    }

    public List<Annotation> getWikiLinkAnnotations() {
        return wikilinks;
    }

    public List<Annotation> getHeaderAnnotations() {
        return headers;
    }

    public List<Annotation> getParagraphAnnotations() {
        return paragraphs;
    }

    public List<String> getParagraphs() {
        List<String> texts = new ArrayList<String>();
        for (Annotation p : paragraphs) {
            texts.add(text.substring(p.begin, p.end));
        }
        return texts;
    }

    public List<String> getHeaders() {
        List<String> texts = new ArrayList<String>();
        for (Annotation h : headers) {
            texts.add(text.substring(h.begin, h.end));
        }
        return texts;
    }

    public String getRedirect() {
        return redirect;
    }

    public class CountingAppendable implements Appendable {

        public int currentPosition = 0;

        final protected Appendable wrappedBuffer;

        public CountingAppendable(Appendable wrappedBuffer) {
            this.wrappedBuffer = wrappedBuffer;
        }

        public Appendable append(CharSequence charSeq) throws IOException {
            currentPosition += charSeq.length();
            return wrappedBuffer.append(charSeq);
        }

        public Appendable append(char aChar) throws IOException {
            currentPosition += 1;
            return wrappedBuffer.append(aChar);
        }

        public Appendable append(CharSequence charSeq, int start, int end)
                throws IOException {
            currentPosition += end - start;
            return wrappedBuffer.append(charSeq, start, end);
        }

    }

}
TOP

Related Classes of pignlproc.markup.AnnotatingMarkupParser$CountingAppendable

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.