Source Code of edu.ucla.sspace.dv.MediumPennTemplateAcceptor

/*
 * Copyright 2010 David Jurgens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.dv;


import edu.ucla.sspace.dependency.DependencyPath;
import edu.ucla.sspace.dependency.DependencyPathAcceptor;
import edu.ucla.sspace.dependency.DependencyTreeNode;


import edu.ucla.sspace.text.IteratorFactory;


import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;




/**      
 * A {@code DependencyPathAcceptor} that accepts the minimum set of path
 * templates specified by <a
 * href="http://www.nlpado.de/~sebastian/pub/papers/cl07_pado.pdf">Padó and
 * Lapata (2007)</a>.  This acceptor is designed to be used with the Penn
 * Treebank part of speech <a
 * href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">tag
 * set</a> and dependency relations from the pre-1.4 Malt parser (not the
 * Stanford typed dependencies).  Note that this template's patters is an
 * <i>adaptation</i> of the the original patterns, which were specified using
 * the Minipar relations and part of speech tags.
 *
 * @see MinimumTemplateAcceptor
 * @see MaximumTemplateAcceptor
 */
public class MediumPennTemplateAcceptor implements DependencyPathAcceptor {


    static final Set<String> MEDIUM_TEMPLATES = new HashSet<String>();


    /**
     * A mapping from a specific POS tag, e.g. NN, JJS, to the general
     * <i>class</i> of part of speech tags, e.g. noun (N), to which it belongs.
     */
    static final Map<String,String> POS_TAG_TO_CLASS = 
        new HashMap<String,String>();


    // Static block for initializing the POS_TAGS_TO_CLASS mapping using the
    // PennTags class
    static {
        // NOTE: the class tags are intentionally short to facilitate faster
        // matching
        for (String noun : PennTags.NOUN_POS_TAGS)
            POS_TAG_TO_CLASS.put(noun, "N"); // Noun
        for (String adj : PennTags.ADJ_POS_TAGS)
            POS_TAG_TO_CLASS.put(adj, "J"); // adJective
        for (String adv : PennTags.ADV_POS_TAGS)
            POS_TAG_TO_CLASS.put(adv, "R"); // adveRb
        for (String verb : PennTags.VERB_POS_TAGS)
            POS_TAG_TO_CLASS.put(verb, "V"); // Verb
    }
    
    /**
     * A mapping from a relation to the more general class of relations to which
     * it belongs, e.g. AMOD and PMOD would be mapped to a "lexical modifier"
     * relation class.
     */
    static final Map<String,String> REL_TO_CLASS =
        new HashMap<String,String>();
    
    // Static block for intializing REL_TO_CLASS
    static {
        for (String mod : PennTags.MODIFIERS)
            REL_TO_CLASS.put(mod, "mod");
    }
    
    // Static block for initializing the medium patterns.  Note that this block
    // uses the shorted class labels for parts of speech, e.g. NNS, NP, NN -> N,
    // in order to handle the combinatorial explosion of patterns that would
    // need to be expressed when moving from the Minipar to Penn tag sets.
    static {
        MEDIUM_TEMPLATES.add("J:nmod:N,N:amod:(null)");
        MEDIUM_TEMPLATES.add("J:nmod:N,N:vmod:(null)");
        MEDIUM_TEMPLATES.add("J:nmod:N,N:nmod:(null)");
        MEDIUM_TEMPLATES.add("J:nmod:N,N:nmod:N");
        MEDIUM_TEMPLATES.add("J:sbj:N,N:amod:(null)");
        MEDIUM_TEMPLATES.add("J:sbj:N,N:nmod:(null)");
        MEDIUM_TEMPLATES.add("J:sbj:N,N:vmod:(null)");
        MEDIUM_TEMPLATES.add("J:sbj:N,N:nmod:N");


        MEDIUM_TEMPLATES.add("R:nmod:N,N:amod:(null)");
        MEDIUM_TEMPLATES.add("R:nmod:N,N:vmod:(null)");
        MEDIUM_TEMPLATES.add("R:nmod:N,N:nmod:(null)");
        MEDIUM_TEMPLATES.add("R:nmod:N,N:nmod:N");
        MEDIUM_TEMPLATES.add("R:sbj:N,N:amod:(null)");
        MEDIUM_TEMPLATES.add("R:sbj:N,N:vmod:(null)");
        MEDIUM_TEMPLATES.add("R:sbj:N,N:nmod:(null)");
        MEDIUM_TEMPLATES.add("R:sbj:N,N:nmod:N");




        MEDIUM_TEMPLATES.add("N:coord:N,N:mod:(null)");
        MEDIUM_TEMPLATES.add("N:coord:N,N:nmod:N");
        MEDIUM_TEMPLATES.add("N:gen:N,N:nmod:(null)");
        MEDIUM_TEMPLATES.add("N:gen:N,N:nmod:N");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:coord:N");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:coord:N,N:nmod:N");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:gen:N");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:gen:N,N:nmod:N");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:mod:A");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:mod:TO");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:obj:V");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:prd:V");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:sbj:A");
        MEDIUM_TEMPLATES.add("N:nmod:N,N:sbj:V");


        MEDIUM_TEMPLATES.add("(null):mod:N,N:coord:N");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:coord:N,N:mod:(null)");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:gen:N");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:gen:N,N:mod:(null)");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:amod:J");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:adv:R");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:pmod:TO");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:obj:V");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:prd:V");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:sbj:J");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:sbj:R");
        MEDIUM_TEMPLATES.add("(null):mod:N,N:sbj:V");


        MEDIUM_TEMPLATES.add("TO:mod:N,N:mod:(null)");
        MEDIUM_TEMPLATES.add("TO:mod:N,N:nmod:N");


        MEDIUM_TEMPLATES.add("V:obj:N,N:mod:(null)");
        MEDIUM_TEMPLATES.add("V:obj:N,N:nmod:N");
        MEDIUM_TEMPLATES.add("V:sbj:N,N:mod:(null)");
        MEDIUM_TEMPLATES.add("V:sbj:N,N:nmod:N");
        MEDIUM_TEMPLATES.add("V:prd:N,N:mod:(null)");
        MEDIUM_TEMPLATES.add("V:prd:N,N:nmod:N");
    }
    
    /**
     * Creates the acceptor with its standard templates
     */
    public MediumPennTemplateAcceptor() { }
   
    /**
     * Returns {@code true} if the path matches one of the predefined templates
     *
     * @param path a dependency path
     *
     * @return {@code true} if the path matches a template
     */
    public boolean accepts(DependencyPath path) {
        return acceptsInternal(path);
    }


    /**
     * A package-private method that checks whether the path matches any of the
     * predefined templates.  This method is provided so other template classes
     * have access to the accept logic used by this class.
     *
     * @param path a dependency path
     *
     * @return {@code true} if the path matches a template
     */
    static boolean acceptsInternal(DependencyPath path) {
        // First check whether the minimum template acceptor would allow this
        // path
        if (MinimumPennTemplateAcceptor.acceptsInternal(path))
            return true;


        // Filter out paths that can't match the template due to length
        if (path.length() > 3)
            return false;


        int pathLength = path.length();


        // The medium set of templates contains "null" matches which are wild
        // cards against any part of speech.  We handle these by generating
        // three possible pattern instances that represent the provided path,
        // two of which include the wildcard "null", one each end.  If any of
        // these patterns are found in the medium set, the path is valid.
        StringBuilder nullStart = new StringBuilder(pathLength * 16);
        StringBuilder nullEnd = new StringBuilder(pathLength * 16);
        StringBuilder noNulls = new StringBuilder(pathLength * 16);


        // Iterate over each pair in the path and create the pattern string that
        // represents this path.  The pattern string is pos:rel:pos[,...] .
        DependencyTreeNode first = path.first();
        for (int i = 1; i < pathLength; ++i) {
            DependencyTreeNode second = path.getNode(i);
            // Check that the nodes weren't filtered out.  If so reject the path
            // even if the part of speech and relation text may have matched a
            // template.
            if (first.word().equals(IteratorFactory.EMPTY_TOKEN))
                return false;


            // Get the relation between the two nodes
            String rel = path.getRelation(i - 1);
            String firstPos = first.pos();
            String secPos = second.pos();


            // Check whether each POS has a class category to which it should be
            // mapped.  These classes are necessary to handle the singificant
            // number of variations for a general category of POS's, e.g. verb
            // -> VBZ, VBJ, etc., which were not present when the MINIPAR tags
            // were designed by Padó and Lapata.
            String class1 = POS_TAG_TO_CLASS.get(firstPos);
            String class2 = POS_TAG_TO_CLASS.get(secPos);
            
            if (class1 != null)
                firstPos = class1;
            if (class2 != null)
                secPos = class2;


            // Similarly, in order to handle the lex-mod relation, we check
            // whether the relation, e.g. PMOD, can be mapped to the general
            // lexical modifier class.
            String relClass = REL_TO_CLASS.get(rel);
            if (relClass != null)
                rel = relClass;
            
            // Create the three relation patterns by checking the current index
            // compared to the path length.
            nullStart.append((i == 1) ? "(null)" : firstPos);
            nullStart.append(":").append(rel).append(":").append(secPos);


            nullEnd.append(firstPos).append(":").append(rel).append(":");
            nullEnd.append((i + 1 == pathLength) ? "(null)" : secPos);


            noNulls.append(firstPos).append(":").append(rel)
                .append(":").append(secPos);


            // Check whether more elements existing, and if so, add the ','
            if (i + 1 < pathLength) {
                nullStart.append(",");
                nullEnd.append(",");
                noNulls.append(",");
            }


            // Last, shift over the node
            first = second;
        }


        // Extra case for the last token in the path
        if (first.word().equals(IteratorFactory.EMPTY_TOKEN))
            return false;


        boolean match = MEDIUM_TEMPLATES.contains(noNulls.toString())
            || MEDIUM_TEMPLATES.contains(nullStart.toString())
            || MEDIUM_TEMPLATES.contains(nullEnd.toString());
        
        return match;
    }


    /**
     * {@inheritDoc}
     */
    public int maxPathLength() {
        return 4;
    }
}
Source Code of edu.ucla.sspace.dv.MediumPennTemplateAcceptor

Related Classes of edu.ucla.sspace.dv.MediumPennTemplateAcceptor