Package net.bpiwowar.mg4j.extensions.trec

Source Code of net.bpiwowar.mg4j.extensions.trec.TRECTopic

package net.bpiwowar.mg4j.extensions.trec;

import bpiwowar.argparser.ListAdaptator;
import it.unimi.dsi.fastutil.chars.CharArrays;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.parser.callback.DefaultCallback;
import net.bpiwowar.mg4j.extensions.query.*;
import org.apache.log4j.Logger;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.List;
import java.util.Map;

import static net.bpiwowar.mg4j.extensions.trec.TRECParsingFactory.*;

/**
* A TREC topic
*
* @author B. Piwowarski <benjamin@bpiwowar.net>
*/
public class TRECTopic implements Topic {
    final static private Logger logger = Logger.getLogger(TRECTopic.class);

    protected String id;

    protected String narrative;

    protected String description;

    protected String title;

    protected String concepts;

    protected String definitions;

    protected String summary;

    public String getTitle() {
        return title;
    }

    public String getDescription() {
        return description;
    }

    public String getNarrative() {
        return narrative;
    }

    public String getSummary() {
        return summary;
    }

    public String getConcepts() {
        return concepts;
    }

    public String getDefinitions() {
        return definitions;
    }

    /**
     * Read a set of TREC topics
     *
     * @param reader
     * @param quoteCommas
     *            Should we quote word separated by commas?
     * @return
     * @throws IOException
     */
    static public QuerySet readTopics(BufferedReader reader,
                                      final boolean quoteCommas) throws IOException {
        logger.debug("Reading a topic file");
        final DefaultQuerySet querySet = new DefaultQuerySet();

        BulletParser bulletParser = new BulletParser(
                TRECParsingFactory.INSTANCE);

        bulletParser.setCallback(new DefaultCallback() {
            TRECTopic topic = null;
            MutableString curText = new MutableString();
            Element curElement;

            @Override
            public boolean characters(char[] text, int offset, int length,
                                      boolean flowBroken) {
                curText.append(text, offset, length);
                return true;
            }

            @Override
            public boolean startElement(Element element,
                                        Map<Attribute, MutableString> attrMapUnused) {

                // --- New tag
                if (topic != null)
                    process();

                // ---
                if (element == ELEMENT_TOP) {
                    topic = new TRECTopic();
                }
                curElement = element;
                return true;
            }

            void removePrefix(String prefix, MutableString text) {
                if (text.startsWith(prefix))
                    text.delete(0, prefix.length());

            }

            private void process() {
                curText.trim();
                curText.replace('\n', ' ');
                curText.squeezeSpaces(false);

                if (curElement == ELEMENT_TITLE) {
                    removePrefix("Topic: ", curText);
                    if (quoteCommas) {
                        StringBuilder builder = new StringBuilder();
                        boolean first = true;
                        for (String part : curText.toString()
                                .split("\\s*,\\s*")) {
                            if (first)
                                first = false;
                            else
                                builder.append(' ');

                            if (part.indexOf(' ') >= 0) {
                                builder.append('"');
                                builder.append(part);
                                builder.append('"');
                            } else
                                builder.append(part);
                        }

                        topic.title = builder.toString();
                    } else
                        topic.title = curText.toString();
                } else if (curElement == ELEMENT_NUM) {
                    removePrefix("Number: ", curText);
                    // Normalise the number
                    topic.id = new Integer(curText.toString()).toString();
                } else if (curElement == ELEMENT_DESC) {
                    removePrefix("Description: ", curText);
                    topic.description = curText.toString();
                } else if (curElement == ELEMENT_NARR) {
                    // TREC
                    removePrefix("Narrative: ", curText);
                    topic.narrative = curText.toString();
                } else if (curElement == ELEMENT_SMRY) {
                    removePrefix("Summary: ", curText);
                    topic.summary = curText.toString();
                } else if (curElement == ELEMENT_CON) {
                    // TREC 1
                    removePrefix("Concepts: ", curText);
                    topic.concepts = curText.toString();
                } else if (curElement == ELEMENT_DEF) {
                    // TREC 1
                    removePrefix("Definition(s): ", curText);
                    removePrefix("Definition: ", curText);
                    topic.definitions = curText.toString();
                }
                curElement = null;
                curText.delete(0, curText.length());
            }

            @Override
            public boolean endElement(Element element) {
                if (topic != null)
                    process();

                if (element == ELEMENT_TOP) {
                    if (topic.id == null) {
                        logger.warn("Topic had no identifier - skipping");
                    } else {
                        logger.debug(String.format("Adding topic %s with title [%s]",
                                topic.id, topic.title));

                        querySet.put(topic.id, topic);
                    }
                    topic = null;
                }
                return true;
            }
        });

        // Read the file & parse
        char text[] = new char[8192];
        int offset = 0, l;
        while ((l = reader.read(text, offset, text.length - offset)) > 0) {
            offset += l;
            text = CharArrays.grow(text, offset + 1);
        }

        bulletParser.parseText(true);
        bulletParser.parseCDATA(true);
        bulletParser.parseTags(true);
        bulletParser.parse(text, 0, offset);

        return querySet;

    }

    @Override
    public String getId() {
        return id;
    }

    final static List<String> list = ListAdaptator.create(new String[]{
            "title", "desc"});

    @Override
    public Query getTopicPart(String type) {
        if ("title".equals(type))
            return new StringQuery(title);

        if ("desc".equals(type))
            return new StringQuery(description);

        return null;
    }

    @Override
    public List<String> getTypes() {
        return list;
    }

}
TOP

Related Classes of net.bpiwowar.mg4j.extensions.trec.TRECTopic

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.