// Source code of edu.ucla.sspace.svs.StructuredVectorSpace

/*
 * Copyright 2010 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.svs;


import edu.ucla.sspace.common.SemanticSpace;


import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.basis.StringBasisMapping;


import edu.ucla.sspace.dependency.DependencyExtractor;
import edu.ucla.sspace.dependency.DependencyExtractorManager;
import edu.ucla.sspace.dependency.DependencyIterator;
import edu.ucla.sspace.dependency.DependencyPath;
import edu.ucla.sspace.dependency.DependencyPathAcceptor;
import edu.ucla.sspace.dependency.DependencyPathWeight;
import edu.ucla.sspace.dependency.DependencyRelation;
import edu.ucla.sspace.dependency.DependencyTreeNode;
import edu.ucla.sspace.dependency.FilteredDependencyIterator;


import edu.ucla.sspace.text.IteratorFactory;


import edu.ucla.sspace.util.Pair;


import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.ScaledSparseDoubleVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;
import edu.ucla.sspace.vector.VectorMath;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;


import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Properties;
import java.util.Set;


import java.util.concurrent.ConcurrentHashMap;


import java.util.logging.Logger;




/**
 * A dependency parsed based approach to statistical semantics that uses a
 * collection of vectors to represent a word.  This implementaiton is based on
 * the following paper:
 *    
 *   <li style="font-family:Garamond, Georgia, serif">Katrin Erk and Sebastian
 *   Sebastian Padó, "A structured vector space model for word meaning in
 *   context," in <i>Annual Meeting of the ACL</i>, Honolulu, Hawaii.
 *   2008.</li>
 *
 * <p>
 *
 * This model requires a dependency parsed corpus.  When processing, three types
 * of vectors: word, which represnts the co-occureences word has with all other
 * tokens via a dependency chain; REL|word, which records the set of tokens that
 * govern the REL relationship with word; and word|REL, which records the set of
 * tokens that are governed by word in the REL relationship.  The first vector
 * is referred to as a lemma vector and the later two are called selectional
 * preference vectors.  In all cases REL is a dependency relationship.
 *
 * <p>
 *
 * This class implements {@link Filterable}, which allows for fine-grained
 * control of which semantics are retained.  The {@link #setSemanticFilter(Set)}
 * method can be used to speficy which words should have their semantics
 * retained.  Note that the words that are filtered out will still be used in
 * computing the semantics of <i>other</i> words.  This behavior is intended for
 * use with a large corpora where retaining the semantics of all words in memory
 * is infeasible. 
 *
 * </p>
 *
 * This class is thread-safe for concurrent calls of {@link
 * #processDocument(BufferedReader) processDocument}.  At any given point in
 * processing, the {@link #getVectorFor(String) getVector} method may be used
 * to access the current semantics of a word.  This allows callers to track
 * incremental changes to the semantics as the corpus is processed. 
 *
 * </p>
 * The {@link #processSpace(Properties) processSpace} method does nothing other
 * than print out the feature indexes in the space to standard out.
 *
 * @author Keith Stevens
 */
public class StructuredVectorSpace implements SemanticSpace, Serializable {


    private static final long serialVersionUID = 1L;


    /**
     * The Semantic Space name for {@link StructuredVectorSpace}
     */
    public static final String SSPACE_NAME = 
        "structured-vector-space";


    /**
     * A static variable for the empty string.
     */
    public static final String EMPTY_STRING = "";


    /**
     * The logger used to record all output
     */
    private static final Logger LOG =
        Logger.getLogger(StructuredVectorSpace.class.getName());


    /**
     * A mapping from terms to dimensions in a co-occcurence space.
     */
    private final StringBasisMapping termBasis;


    /**
     * A mapping from terms to their lemma co-occurrence vectors.  These vectors
     * simply represent the number of times other words have occurrend with the
     * key word using any relation link with a distance of one relation.
     */
    private final Map<String, SelectionalPreference> preferenceVectors;


    /**
     * The {@link VectorCombinor} responsible for merging features between two
     * {@link SparseDoubleVector}s.  This is used when computing the relational
     * preference vectors for each word and for computing the contextualized
     * vectors.
     */
    private final VectorCombinor combinor;


    /**
     * A mapping for relation tuples (head word, relation, dependent word)
     * counting the number of times this relation has occurred in the corpus.
     * Only tuples where both words are accepted by the filter are stored.  In
     * order to eliminate duplicate  counting, each relation is only counted
     * once per headword observed, i.e. a sentence with cat as a headword of
     * food will create two dependency paths, one rooted at cat and one rooted
     * at food, this only records the data rooted at cat for this single
     * occurrence.
     */
    transient private Map<RelationTuple, SparseDoubleVector> relationVectors;


    /**
     * The {@link DependencyExtractor} being used for parsing corpora.
     */
    transient private final DependencyExtractor parser;


    /**
     * The {@link DependencyPathAcceptor} to use for validating paths.
     */
    transient private final DependencyPathAcceptor acceptor;


    /**
     * An optional set of words that restricts the set of semantic vectors that
     * this instance will retain.
     */
    transient private final Set<String> semanticFilter;


    /**
     * Create a new instance of {@code StructuredVectorSpace}.
     */
    public StructuredVectorSpace(DependencyExtractor extractor,
                                 DependencyPathAcceptor acceptor,
                                 VectorCombinor combinor) {
        this(extractor, acceptor, combinor,
             new StringBasisMapping(), new HashSet<String>());
    }


    /**
     * Create a new instance of {@code StructuredVectorSpace}.
     */
    public StructuredVectorSpace(DependencyExtractor extractor,
                                 DependencyPathAcceptor acceptor,
                                 VectorCombinor combinor,
                                 StringBasisMapping termBasis,
                                 Set<String> semanticFilter) {
        this.parser = extractor;
        this.acceptor = acceptor;
        this.combinor = combinor;
        this.termBasis = termBasis;
        this.semanticFilter = semanticFilter;


        preferenceVectors =
            new ConcurrentHashMap<String, SelectionalPreference>();
        relationVectors =
            new ConcurrentHashMap<RelationTuple, SparseDoubleVector>();
    }


    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        return Collections.unmodifiableSet(preferenceVectors.keySet());
    }


    /**
     * {@inheritDoc}
     */
    public Vector getVector(String term) {
        SelectionalPreference preference = preferenceVectors.get(term);
        return (preference == null) ? null : preference.lemmaVector;
    }


    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return SSPACE_NAME; 
    }


    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return termBasis.numDimensions();
    }


    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document) throws IOException {
        // Local maps to record occurrence counts.
        Map<Pair<String>,Double> localLemmaCounts = 
            new HashMap<Pair<String>,Double>();
        Map<RelationTuple, SparseDoubleVector> localTuples =
            new HashMap<RelationTuple, SparseDoubleVector>();


        // Iterate over all of the parseable dependency parsed sentences in the
        // document.
        for (DependencyTreeNode[] nodes = null;
                (nodes = parser.readNextTree(document)) != null; ) {


            // Skip empty documents.
            if (nodes.length == 0)
                continue;


            // Examine the paths for each word in the sentence.
            for (int i = 0; i < nodes.length; ++i) {
                // Reject words that are not nouns, verbs, or adjectives.
                if (!(nodes[i].pos().startsWith("N") ||
                      nodes[i].pos().startsWith("J") ||
                      nodes[i].pos().startsWith("V")))
                    continue;


                String focusWord = nodes[i].word();


                // Skip words that are rejected by the semantic filter.
                if (!acceptWord(focusWord))
                    continue;
                int focusIndex = termBasis.getDimension(focusWord);


                // Create the path iterator for all acceptable paths rooted at
                // the focus word in the sentence.
                Iterator<DependencyPath> pathIter = 
                    new FilteredDependencyIterator(nodes[i], acceptor, 1);


                while (pathIter.hasNext()) {
                    DependencyPath path = pathIter.next();
                    DependencyTreeNode last = path.last();


                    // Reject words that are not nouns, verbs, or adjectives.
                    if (!(last.pos().startsWith("N") ||
                          last.pos().startsWith("J") ||
                          last.pos().startsWith("V")))
                        continue;


                    // Get the feature index for the co-occurring word.
                    String otherTerm = last.word();
                    
                    // Skip any filtered features.
                    if (otherTerm.equals(EMPTY_STRING))
                        continue;


                    int featureIndex = termBasis.getDimension(otherTerm);


                    Pair<String> p = new Pair<String>(focusWord, otherTerm);
                    Double curCount = localLemmaCounts.get(p);
                    localLemmaCounts.put(p, (curCount == null)
                            ? 1 : 1 + curCount);


                    // Create a RelationTuple as a local key that records this
                    // relation tuple occurrence.  If there is not a local
                    // relation vector, create it.  Then add an occurrence count
                    // of 1.
                    DependencyRelation relation = path.iterator().next();


                    // Skip relations that do not have the focusWord as the
                    // head word in the relation.  The inverse relation will
                    // eventually be encountered and we'll account for it then.
                    if (!relation.headNode().word().equals(focusWord))
                        continue;


                    RelationTuple relationKey = new RelationTuple(
                            focusIndex, relation.relation().intern());
                    SparseDoubleVector relationVector = localTuples.get(
                            relationKey);
                    if (relationVector == null) {
                        relationVector = new CompactSparseVector();
                        localTuples.put(relationKey, relationVector);
                    }
                    relationVector.add(featureIndex, 1);
                }
            }
        }


        document.close();


        // Once the document has been processed, update the co-occurrence matrix
        // accordingly.
        for (Map.Entry<Pair<String>,Double> e : localLemmaCounts.entrySet()){
            // Push the local co-occurrence counts to the larger mapping.
            Pair<String> p = e.getKey();


            // Get the prefernce vectors for the current focus word.  If they do
            // not exist, create it in a thread safe manner.
            SelectionalPreference preference = preferenceVectors.get(p.x);
            if (preference == null) {
                synchronized (this) {
                    preference = preferenceVectors.get(p.x);
                    if (preference == null) {
                        preference = new SelectionalPreference(combinor);
                        preferenceVectors.put(p.x, preference);
                    }
                }
            }
            // Add the local count.
            synchronized (preference) {
                preference.lemmaVector.add(
                        termBasis.getDimension(p.y), e.getValue());
            }
        }


        // Push the relation tuple counts to the larger counts.
        for (Map.Entry<RelationTuple, SparseDoubleVector> r : 
                localTuples.entrySet()) {
            // Get the global counts for this relation tuple.  If it does not
            // exist, create a new one in a thread safe manner.
            SparseDoubleVector relationCounts = relationVectors.get(r.getKey());
            if (relationCounts == null) {
                synchronized (this) {
                    relationCounts = relationVectors.get(r.getKey());
                    if (relationCounts == null) {
                        relationCounts = new CompactSparseVector();
                        relationVectors.put(r.getKey(), relationCounts);
                    }
                }
            }


            // Update the counts.
            synchronized (relationCounts) {
                VectorMath.add(relationCounts, r.getValue());
            }
        }
    }


    /**
     * {@inheritDoc}
     */
    public void processSpace(Properties properties) {
        SparseDoubleVector empty = new CompactSparseVector();
        for (Map.Entry<RelationTuple, SparseDoubleVector> e :
                relationVectors.entrySet()) {
            RelationTuple relation = e.getKey();
            SparseDoubleVector relationCounts = e.getValue();
            String headWord = termBasis.getDimensionDescription(relation.head);
            String rel = relation.relation;


            SelectionalPreference headPref = preferenceVectors.get(headWord);


            if (headPref == null)
                LOG.fine("what the fuck");


            for (int index : relationCounts.getNonZeroIndices()) {
                double frequency = relationCounts.get(index);
                String depWord = termBasis.getDimensionDescription(index);
                SelectionalPreference depPref = preferenceVectors.get(depWord);


                // It's possible that the dependent word is not being
                // represented in this space, so skip missing terms.
                if (depPref == null)
                    continue;


                headPref.addPreference(
                        rel, depPref.lemmaVector, frequency);
                depPref.addInversePreference(
                        rel, headPref.lemmaVector, frequency);
            }
            e.setValue(empty);
        }


        // Null out all the relation tuple counts so that memory can be
        // freed up.
        relationVectors = null;
    }


    public SparseDoubleVector contextualize(String focusWord, 
                                            String relation,
                                            String secondWord,
                                            boolean isFocusHeadWord) {
        SelectionalPreference focusPref = preferenceVectors.get(focusWord);
        SelectionalPreference secondPref = preferenceVectors.get(secondWord);


        if (focusPref == null)
            return null;
        if (secondPref == null)
            return focusPref.lemmaVector;


        if (isFocusHeadWord)
            return combinor.combineUnmodified(
                    focusPref.lemmaVector, 
                    secondPref.inversePreference(relation));
        return combinor.combineUnmodified(focusPref.lemmaVector, 
                                          secondPref.preference(relation));
    }


    /**
     * {@inheritDoc}.
     */
    public void setSemanticFilter(Set<String> semanticsToRetain) {
        semanticFilter.clear();
        semanticFilter.addAll(semanticsToRetain);
    }


    /**
     * Returns true if there is no semantic filter list or the word is in the
     * filter list.
     */
    private boolean acceptWord(String word) {
        return !word.equals(EMPTY_STRING) && 
               (semanticFilter.isEmpty() || semanticFilter.contains(word));
    }
}
// Source code of edu.ucla.sspace.svs.StructuredVectorSpace

// Related classes of edu.ucla.sspace.svs.StructuredVectorSpace