// Package edu.ucla.sspace.svs
//
// Source Code of edu.ucla.sspace.svs.StructuredVectorSpace

/*
* Copyright 2010 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.svs;

import edu.ucla.sspace.common.SemanticSpace;

import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.basis.StringBasisMapping;

import edu.ucla.sspace.dependency.DependencyExtractor;
import edu.ucla.sspace.dependency.DependencyExtractorManager;
import edu.ucla.sspace.dependency.DependencyIterator;
import edu.ucla.sspace.dependency.DependencyPath;
import edu.ucla.sspace.dependency.DependencyPathAcceptor;
import edu.ucla.sspace.dependency.DependencyPathWeight;
import edu.ucla.sspace.dependency.DependencyRelation;
import edu.ucla.sspace.dependency.DependencyTreeNode;
import edu.ucla.sspace.dependency.FilteredDependencyIterator;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.util.Pair;

import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.ScaledSparseDoubleVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;
import edu.ucla.sspace.vector.VectorMath;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import java.util.concurrent.ConcurrentHashMap;

import java.util.logging.Logger;


/**
* A dependency parsed based approach to statistical semantics that uses a
* collection of vectors to represent a word.  This implementaiton is based on
* the following paper:
*   
*   <li style="font-family:Garamond, Georgia, serif">Katrin Erk and Sebastian
*   Sebastian Padó, "A structured vector space model for word meaning in
*   context," in <i>Annual Meeting of the ACL</i>, Honolulu, Hawaii.
*   2008.</li>
*
* <p>
*
* This model requires a dependency parsed corpus.  When processing, three types
* of vectors: word, which represnts the co-occureences word has with all other
* tokens via a dependency chain; REL|word, which records the set of tokens that
* govern the REL relationship with word; and word|REL, which records the set of
* tokens that are governed by word in the REL relationship.  The first vector
* is referred to as a lemma vector and the later two are called selectional
* preference vectors.  In all cases REL is a dependency relationship.
*
* <p>
*
* This class implements {@link Filterable}, which allows for fine-grained
* control of which semantics are retained.  The {@link #setSemanticFilter(Set)}
* method can be used to speficy which words should have their semantics
* retained.  Note that the words that are filtered out will still be used in
* computing the semantics of <i>other</i> words.  This behavior is intended for
* use with a large corpora where retaining the semantics of all words in memory
* is infeasible.
*
* </p>
*
* This class is thread-safe for concurrent calls of {@link
* #processDocument(BufferedReader) processDocument}.  At any given point in
* processing, the {@link #getVectorFor(String) getVector} method may be used
* to access the current semantics of a word.  This allows callers to track
* incremental changes to the semantics as the corpus is processed.
*
* </p>
* The {@link #processSpace(Properties) processSpace} method does nothing other
* than print out the feature indexes in the space to standard out.
*
* @author Keith Stevens
*/
public class StructuredVectorSpace implements SemanticSpace, Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * The Semantic Space name for {@link StructuredVectorSpace}
     */
    public static final String SSPACE_NAME =
        "structured-vector-space";

    /**
     * A static variable for the empty string.
     */
    public static final String EMPTY_STRING = "";

    /**
     * The logger used to record all output
     */
    private static final Logger LOG =
        Logger.getLogger(StructuredVectorSpace.class.getName());

    /**
     * A mapping from terms to dimensions in a co-occcurence space.
     */
    private final StringBasisMapping termBasis;

    /**
     * A mapping from terms to their lemma co-occurrence vectors.  These vectors
     * simply represent the number of times other words have occurrend with the
     * key word using any relation link with a distance of one relation.
     */
    private final Map<String, SelectionalPreference> preferenceVectors;

    /**
     * The {@link VectorCombinor} responsible for merging features between two
     * {@link SparseDoubleVector}s.  This is used when computing the relational
     * preference vectors for each word and for computing the contextualized
     * vectors.
     */
    private final VectorCombinor combinor;

    /**
     * A mapping for relation tuples (head word, relation, dependent word)
     * counting the number of times this relation has occurred in the corpus.
     * Only tuples where both words are accepted by the filter are stored.  In
     * order to eliminate duplicate  counting, each relation is only counted
     * once per headword observed, i.e. a sentence with cat as a headword of
     * food will create two dependency paths, one rooted at cat and one rooted
     * at food, this only records the data rooted at cat for this single
     * occurrence.
     */
    transient private Map<RelationTuple, SparseDoubleVector> relationVectors;

    /**
     * The {@link DependencyExtractor} being used for parsing corpora.
     */
    transient private final DependencyExtractor parser;

    /**
     * The {@link DependencyPathAcceptor} to use for validating paths.
     */
    transient private final DependencyPathAcceptor acceptor;

    /**
     * An optional set of words that restricts the set of semantic vectors that
     * this instance will retain.
     */
    transient private final Set<String> semanticFilter;

    /**
     * Create a new instance of {@code StructuredVectorSpace}.
     */
    public StructuredVectorSpace(DependencyExtractor extractor,
                                 DependencyPathAcceptor acceptor,
                                 VectorCombinor combinor) {
        this(extractor, acceptor, combinor,
             new StringBasisMapping(), new HashSet<String>());
    }

    /**
     * Create a new instance of {@code StructuredVectorSpace}.
     */
    public StructuredVectorSpace(DependencyExtractor extractor,
                                 DependencyPathAcceptor acceptor,
                                 VectorCombinor combinor,
                                 StringBasisMapping termBasis,
                                 Set<String> semanticFilter) {
        this.parser = extractor;
        this.acceptor = acceptor;
        this.combinor = combinor;
        this.termBasis = termBasis;
        this.semanticFilter = semanticFilter;

        preferenceVectors =
            new ConcurrentHashMap<String, SelectionalPreference>();
        relationVectors =
            new ConcurrentHashMap<RelationTuple, SparseDoubleVector>();
    }

    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        return Collections.unmodifiableSet(preferenceVectors.keySet());
    }

    /**
     * {@inheritDoc}
     */
    public Vector getVector(String term) {
        SelectionalPreference preference = preferenceVectors.get(term);
        return (preference == null) ? null : preference.lemmaVector;
    }

    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return SSPACE_NAME;
    }

    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return termBasis.numDimensions();
    }

    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document) throws IOException {
        // Local maps to record occurrence counts.
        Map<Pair<String>,Double> localLemmaCounts =
            new HashMap<Pair<String>,Double>();
        Map<RelationTuple, SparseDoubleVector> localTuples =
            new HashMap<RelationTuple, SparseDoubleVector>();

        // Iterate over all of the parseable dependency parsed sentences in the
        // document.
        for (DependencyTreeNode[] nodes = null;
                (nodes = parser.readNextTree(document)) != null; ) {

            // Skip empty documents.
            if (nodes.length == 0)
                continue;

            // Examine the paths for each word in the sentence.
            for (int i = 0; i < nodes.length; ++i) {
                // Reject words that are not nouns, verbs, or adjectives.
                if (!(nodes[i].pos().startsWith("N") ||
                      nodes[i].pos().startsWith("J") ||
                      nodes[i].pos().startsWith("V")))
                    continue;

                String focusWord = nodes[i].word();

                // Skip words that are rejected by the semantic filter.
                if (!acceptWord(focusWord))
                    continue;
                int focusIndex = termBasis.getDimension(focusWord);

                // Create the path iterator for all acceptable paths rooted at
                // the focus word in the sentence.
                Iterator<DependencyPath> pathIter =
                    new FilteredDependencyIterator(nodes[i], acceptor, 1);

                while (pathIter.hasNext()) {
                    DependencyPath path = pathIter.next();
                    DependencyTreeNode last = path.last();

                    // Reject words that are not nouns, verbs, or adjectives.
                    if (!(last.pos().startsWith("N") ||
                          last.pos().startsWith("J") ||
                          last.pos().startsWith("V")))
                        continue;

                    // Get the feature index for the co-occurring word.
                    String otherTerm = last.word();
                   
                    // Skip any filtered features.
                    if (otherTerm.equals(EMPTY_STRING))
                        continue;

                    int featureIndex = termBasis.getDimension(otherTerm);

                    Pair<String> p = new Pair<String>(focusWord, otherTerm);
                    Double curCount = localLemmaCounts.get(p);
                    localLemmaCounts.put(p, (curCount == null)
                            ? 1 : 1 + curCount);

                    // Create a RelationTuple as a local key that records this
                    // relation tuple occurrence.  If there is not a local
                    // relation vector, create it.  Then add an occurrence count
                    // of 1.
                    DependencyRelation relation = path.iterator().next();

                    // Skip relations that do not have the focusWord as the
                    // head word in the relation.  The inverse relation will
                    // eventually be encountered and we'll account for it then.
                    if (!relation.headNode().word().equals(focusWord))
                        continue;

                    RelationTuple relationKey = new RelationTuple(
                            focusIndex, relation.relation().intern());
                    SparseDoubleVector relationVector = localTuples.get(
                            relationKey);
                    if (relationVector == null) {
                        relationVector = new CompactSparseVector();
                        localTuples.put(relationKey, relationVector);
                    }
                    relationVector.add(featureIndex, 1);
                }
            }
        }

        document.close();

        // Once the document has been processed, update the co-occurrence matrix
        // accordingly.
        for (Map.Entry<Pair<String>,Double> e : localLemmaCounts.entrySet()){
            // Push the local co-occurrence counts to the larger mapping.
            Pair<String> p = e.getKey();

            // Get the prefernce vectors for the current focus word.  If they do
            // not exist, create it in a thread safe manner.
            SelectionalPreference preference = preferenceVectors.get(p.x);
            if (preference == null) {
                synchronized (this) {
                    preference = preferenceVectors.get(p.x);
                    if (preference == null) {
                        preference = new SelectionalPreference(combinor);
                        preferenceVectors.put(p.x, preference);
                    }
                }
            }
            // Add the local count.
            synchronized (preference) {
                preference.lemmaVector.add(
                        termBasis.getDimension(p.y), e.getValue());
            }
        }

        // Push the relation tuple counts to the larger counts.
        for (Map.Entry<RelationTuple, SparseDoubleVector> r :
                localTuples.entrySet()) {
            // Get the global counts for this relation tuple.  If it does not
            // exist, create a new one in a thread safe manner.
            SparseDoubleVector relationCounts = relationVectors.get(r.getKey());
            if (relationCounts == null) {
                synchronized (this) {
                    relationCounts = relationVectors.get(r.getKey());
                    if (relationCounts == null) {
                        relationCounts = new CompactSparseVector();
                        relationVectors.put(r.getKey(), relationCounts);
                    }
                }
            }

            // Update the counts.
            synchronized (relationCounts) {
                VectorMath.add(relationCounts, r.getValue());
            }
        }
    }

    /**
     * {@inheritDoc}
     */
    public void processSpace(Properties properties) {
        SparseDoubleVector empty = new CompactSparseVector();
        for (Map.Entry<RelationTuple, SparseDoubleVector> e :
                relationVectors.entrySet()) {
            RelationTuple relation = e.getKey();
            SparseDoubleVector relationCounts = e.getValue();
            String headWord = termBasis.getDimensionDescription(relation.head);
            String rel = relation.relation;

            SelectionalPreference headPref = preferenceVectors.get(headWord);

            if (headPref == null)
                LOG.fine("what the fuck");

            for (int index : relationCounts.getNonZeroIndices()) {
                double frequency = relationCounts.get(index);
                String depWord = termBasis.getDimensionDescription(index);
                SelectionalPreference depPref = preferenceVectors.get(depWord);

                // It's possible that the dependent word is not being
                // represented in this space, so skip missing terms.
                if (depPref == null)
                    continue;

                headPref.addPreference(
                        rel, depPref.lemmaVector, frequency);
                depPref.addInversePreference(
                        rel, headPref.lemmaVector, frequency);
            }
            e.setValue(empty);
        }

        // Null out all the relation tuple counts so that memory can be
        // freed up.
        relationVectors = null;
    }

    public SparseDoubleVector contextualize(String focusWord,
                                            String relation,
                                            String secondWord,
                                            boolean isFocusHeadWord) {
        SelectionalPreference focusPref = preferenceVectors.get(focusWord);
        SelectionalPreference secondPref = preferenceVectors.get(secondWord);

        if (focusPref == null)
            return null;
        if (secondPref == null)
            return focusPref.lemmaVector;

        if (isFocusHeadWord)
            return combinor.combineUnmodified(
                    focusPref.lemmaVector,
                    secondPref.inversePreference(relation));
        return combinor.combineUnmodified(focusPref.lemmaVector,
                                          secondPref.preference(relation));
    }

    /**
     * {@inheritDoc}.
     */
    public void setSemanticFilter(Set<String> semanticsToRetain) {
        semanticFilter.clear();
        semanticFilter.addAll(semanticsToRetain);
    }

    /**
     * Returns true if there is no semantic filter list or the word is in the
     * filter list.
     */
    private boolean acceptWord(String word) {
        return !word.equals(EMPTY_STRING) &&
               (semanticFilter.isEmpty() || semanticFilter.contains(word));
    }
}
// TOP
//
// Related Classes of edu.ucla.sspace.svs.StructuredVectorSpace
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.