Source Code of edu.ucla.sspace.dri.DependencyRandomIndexing

/*
 * Copyright 2010 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.dri;


import edu.ucla.sspace.common.SemanticSpace;


import edu.ucla.sspace.dependency.DependencyExtractor;
import edu.ucla.sspace.dependency.DependencyExtractorManager;
import edu.ucla.sspace.dependency.DependencyIterator;
import edu.ucla.sspace.dependency.DependencyPath;
import edu.ucla.sspace.dependency.DependencyRelationAcceptor;
import edu.ucla.sspace.dependency.DependencyPathWeight;
import edu.ucla.sspace.dependency.DependencyPermutationFunction;
import edu.ucla.sspace.dependency.DependencyRelation;
import edu.ucla.sspace.dependency.DependencyTreeNode;
import edu.ucla.sspace.dependency.FlatPathWeight;
import edu.ucla.sspace.dependency.UniversalRelationAcceptor;


import edu.ucla.sspace.index.PermutationFunction;
import edu.ucla.sspace.index.RandomIndexVectorGenerator;


import edu.ucla.sspace.text.IteratorFactory;


import edu.ucla.sspace.util.Generator;
import edu.ucla.sspace.util.GeneratorMap;
import edu.ucla.sspace.util.ReflectionUtil;


import edu.ucla.sspace.vector.CompactSparseIntegerVector;
import edu.ucla.sspace.vector.IntegerVector;
import edu.ucla.sspace.vector.SparseIntegerVector;
import edu.ucla.sspace.vector.TernaryVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;


import java.io.BufferedReader;
import java.io.IOException;


import java.util.ArrayDeque;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Random;
import java.util.Set;


import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;


import java.util.logging.Logger;




/**
 * A co-occurrence based approach to statistical semantics that uses dependency
 * parse trees and approximates a full co-occurrence matrix by using a
 * randomized projection. This implementation is an extension of {@link
 * edu.ucla.sspace.ri.RandomIndexing}, which is based on three papers: <ul>
 *
 *   <li style="font-family:Garamond, Georgia, serif">M. Sahlgren, "Vector-based
 *     semantic analysis: Representing word meanings based on random labels," in
 *     <i>Proceedings of the ESSLLI 2001 Workshop on Semantic Knowledge
 *     Acquisition and Categorisation</i>, Helsinki, Finland, 2001.</li>
 *
 *   <li style="font-family:Garamond, Georgia, serif">M. Sahlgren, "An
 *     introduction to random indexing," in <i>Proceedings of the Methods and
 *     Applications of Semantic Indexing Workshop at the 7th International
 *     Conference on Terminology and Knowledge Engineering</i>, 2005.</li>
 *
 *   <li style="font-family:Garamond, Georgia, serif">M. Sahlgren, A. Holst, and
 *     P. Kanerva, "Permutations as a means to encode order in word space," in
 *     <i>Proceedings of the 30th Annual Meeting of the Cognitive Science
 *     Society (CogSci’08)</i>, 2008.</li>
 *
 * </ul>
 *
 * </p>
 *
 * The technique for incorporating dependency parse trees is based on the
 * following paper:
 *   <li style="font-family:Garamond, Georgia, serif">S Pado and M. Lapata,
 *   "Dependency-Based Construction of Semantic Space Models," in <i>Association
 *   for Computational Linguistics</i>, 2007</li>
 *
 * <p>
 *
 * Dependency Random Indexing (DRI) extends Random Indexing by restricting a
 * word's context to be the set of words with which it has a syntactic
 * relationship.
 * Full word co-occurrence models have shown that this restricted interpretation
 * of a context can improve the semantic representations.  DRI uses the same
 * approximation technique as Random Indexing to project this full co-occurrence
 * space into a significantly smaller dimensional space.  This projection is
 * done through use of index vectors, each of which are sparse and mostly
 * orthogonal to all other index vectors.  The summation of a word's index
 * vectors corresponds directly to that word's occurrence in a context.
 *
 * <p> 
 *
 * While Random Indexing uses permutations of these index vectors to encode
 * lexical position, a shallow form of syntactic structure, DRI extends the
 * notion of permutations to allow for the encoding of dependency relationships.
 * Through this modification, the set of relationships between any two
 * co-occurring words in a sentence can be encoded, as can the distance between
 * the two words.  Under this model, each possible dependency relationship could
 * have its own permutation function, as could each possible distance between
 * co-occurring words.
 *
 * </p>
 *
 * This class defines the following configurable properties that may be set
 * using either the System properties or using the {@link
 * DependencyRandomIndexing#DependencyRandomIndexing(
 * DependencyPermutationFunction, Properties)} constructor.
 *
 * <dl style="margin-left: 1em">
 *
 * <dt> <i>Property:</i> <code><b>{@value #DEPENDENCY_ACCEPTOR_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@link UniversalRelationAcceptor}
 *
 * <dd style="padding-top: .5em">This property sets {@link
 *      DependencyRelationAcceptor} to use for validating dependency paths.  If a
 *      path is rejected it will not influence either the lemma vector or the
 *      selectional preference vectors. </p>
 *
 * <dt> <i>Property:</i> <code><b>{@value #DEPENDENCY_PATH_LENGTH_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@value #DEFAULT_DEPENDENCY_PATH_LENGTH}
 *
 * <dd style="padding-top: .5em">This property sets the maximal length a
 *      dependency path can be for it to be accepted.  Paths beyond this length
 *      will not contribute towards either the lemma vectors or selectional
 *      preference vectors. </p>
 *
 * <dt> <i>Property:</i> <code><b>{@value #VECTOR_LENGTH_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@value #DEFAULT_VECTOR_LENGTH}
 *
 * <dd style="padding-top: .5em">This property sets the number of dimensions in
 *      the word space.
 *
 * </dl>
 *
 * </p>
 *
 * This class implements {@link Filterable}, which allows for fine-grained
 * control of which semantics are retained.  The {@link #setSemanticFilter(Set)}
 * method can be used to specify which words should have their semantics
 * retained.  Note that the words that are filtered out will still be used in
 * computing the semantics of <i>other</i> words.  This behavior is intended for
 * use with a large corpora where retaining the semantics of all words in memory
 * is infeasible.
 *
 * </p>
 *
 * This class is thread-safe for concurrent calls of {@link
 * #processDocument(BufferedReader) processDocument}.  At any given point in
 * processing, the {@link #getVector(String) getVector} method may be used
 * to access the current semantics of a word.  This allows callers to track
 * incremental changes to the semantics as the corpus is processed.
 *
 * </p>
 *
 * The {@link #processSpace(Properties) processSpace} method does nothing for
 * this class and calls to it will not affect the results of {@code
 * getVector}.
 *
 * @see RandomIndexing
 * @see DependencyPermutationFunction
 *
 * @author Keith Stevens
 */
public class DependencyRandomIndexing implements SemanticSpace {


    /**
     * The base prefix for all {@code DependencyRandomIndexing}
     * properties.
     */
    public static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.dri.DependencyRandomIndexing";


    /**
     * The property for setting the number of dimensions in the word space.
     */
    public static final String VECTOR_LENGTH_PROPERTY =
        PROPERTY_PREFIX + ".indexVectorLength";


    /**
     * The property for setting the {@link DependencyRelationAcceptor}.
     */
    public static final String DEPENDENCY_ACCEPTOR_PROPERTY =
        PROPERTY_PREFIX + ".dependencyAcceptor";


    /**
     * The property for setting the maximal length of any {@link
     * DependencyPath}.
     */
    public static final String DEPENDENCY_PATH_LENGTH_PROPERTY =
        PROPERTY_PREFIX + ".dependencyPathLength";


    /**
     * The default vector length.
     */
    public static final int DEFAULT_VECTOR_LENGTH = 50000;


    /**
     * The default legnth a dependency path may have.
     */
    public static final int DEFAULT_DEPENDENCY_PATH_LENGTH = Integer.MAX_VALUE;


    /**
     * The Semantic Space name for {@link DependencyRandomIndexing}
     */
    public static final String SSPACE_NAME = 
        "dependency-random-indexing";


    /**
     * The logger used to record all output
     */
    private static final Logger LOGGER =
        Logger.getLogger(DependencyRandomIndexing.class.getName());


    /**
     * A mapping from strings to {@code IntegerVector}s which represent an index
     * vector.
     */
    private Map<String, TernaryVector> indexMap;


    /**
     * The {@code PermutationFunction} to use for co-occurrances.
     */
    private final DependencyPermutationFunction<TernaryVector> permFunc;


    /**
     * A map that represents the word space by mapping raw strings to vectors.
     */
    private ConcurrentMap<String, IntegerVector> wordSpace;


    /**
     * The size of each index vector, as set when the sspace is created.
     */
    private final int vectorLength;


    /**
     * The {@link DependencyExtractor} being used for parsing corpora.
     */
    private final DependencyExtractor parser;


    /**
     * The {@link DependencyRelationAcceptor} to use for validating paths.
     */
    private final DependencyRelationAcceptor acceptor;


    /**
     * The maximum number of relations any path may have.
     */
    private final int pathLength;


    /**
     * An optional set of words that restricts the set of semantic vectors that
     * this instance will retain.
     */
    private Set<String> semanticFilter;


    /**
     * Creates a new instance of {@code DependencyRandomIndexing} that takes
     * ownership of a {@link DependencyExtractor} and uses the System provided
     * properties to specify other class objects.
     */
    public DependencyRandomIndexing(
            DependencyPermutationFunction<TernaryVector> permFunc) {
        this(permFunc, System.getProperties());
    }


    /**
     * Create a new instance of {@code DependencyRandomIndexing} which
     * takes ownership
     */
    public DependencyRandomIndexing(
            DependencyPermutationFunction<TernaryVector> permFunc,
            Properties properties) {
        this.permFunc = permFunc;
        this.parser = DependencyExtractorManager.getDefaultExtractor();


        // Load the vector length.
        String vectorLengthProp = 
            properties.getProperty(VECTOR_LENGTH_PROPERTY);
        vectorLength = (vectorLengthProp != null)
            ? Integer.parseInt(vectorLengthProp)
            : DEFAULT_VECTOR_LENGTH;


        // Load the maximum dependency path length.
        String pathLengthProp =
            properties.getProperty(DEPENDENCY_PATH_LENGTH_PROPERTY);
        pathLength = (pathLengthProp != null)
            ? Integer.parseInt(pathLengthProp)
            : DEFAULT_DEPENDENCY_PATH_LENGTH;


        // Load the path acceptor.
        String acceptorProp = 
            properties.getProperty(DEPENDENCY_ACCEPTOR_PROPERTY);
        acceptor = (acceptorProp != null)
            ? (DependencyRelationAcceptor) 
                ReflectionUtil.getObjectInstance(acceptorProp)
            : new UniversalRelationAcceptor();


        // Set up the generator vector maps.
        RandomIndexVectorGenerator indexVectorGenerator = 
            new RandomIndexVectorGenerator(vectorLength, properties);
        indexMap = new GeneratorMap<TernaryVector>(indexVectorGenerator);
        wordSpace = new ConcurrentHashMap<String,IntegerVector>();
        semanticFilter = new HashSet<String>();
    }


    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        return Collections.unmodifiableSet(wordSpace.keySet());
    }


    /**
     * {@inheritDoc}
     */
    public Vector getVector(String term) {
        return Vectors.immutable(wordSpace.get(term));
    }


    public DependencyPermutationFunction<TernaryVector> getPermutations() {
        return permFunc;
    }


    public Map<String, TernaryVector> getWordToVectorMap() {
        return indexMap;
    }


    public void setWordToVectorMap(Map<String, TernaryVector> vectorMap) {
        indexMap = vectorMap;
    }


    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return SSPACE_NAME + "-" + vectorLength;
    }


    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return vectorLength;
    }


    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document) throws IOException {
        // Iterate over all of the parseable dependency parsed sentences in the
        // document.
        for (DependencyTreeNode[] nodes = null;
                (nodes = parser.readNextTree(document)) != null; ) {


            // Skip empty documents.
            if (nodes.length == 0)
                continue;


            // Examine the paths for each word in the sentence.
            for (int i = 0; i < nodes.length; ++i) {
                String focusWord = nodes[i].word();


                // Skip words that are rejected by the semantic filter.
                if (!acceptWord(focusWord))
                    continue;


                // Acquire the semantic vector for the focus word.
                IntegerVector focusMeaning = getSemanticVector(focusWord);


                // Create the path iterator for all acceptable paths rooted at
                // the focus word in the sentence.
                Iterator<DependencyPath> pathIter = 
                    new DependencyIterator(nodes[i], acceptor, pathLength);


                // For every path, obtain the index vector of the last word in
                // the path and add it to the semantic vector for the focus
                // word.  The index vector is permuted if a permutation
                // function has been provided based on the contents of the path.
                while (pathIter.hasNext()) {
                    DependencyPath path = pathIter.next();
                    TernaryVector termVector = indexMap.get(path.last().word());
                    if (permFunc != null)
                        termVector = permFunc.permute(termVector, path);
                    add(focusMeaning, termVector);
                }
            }
        }
        document.close();
    }
        
    /**
     * Does nothing.
     *
     * @param properties {@inheritDoc}
     */
    public void processSpace(Properties properties) {
    }


    /**
     * Assigns the word to {@link IntegerVector} mapping to be used by this
     * instance.  This instance takes ownership of the passed in map.
     *
     * @param m a mapping from token to the {@code IntegerVector} that should be
     *        used represent it when calculating other word's semantics
     */
    public void setWordToIndexVector(Map<String,TernaryVector> m) {
        indexMap = m;
    }


    /**
     * {@inheritDoc}.
     *
     * </p> Note that all words will still have an index vector assigned to
     * them, which is necessary to properly compute the semantics.
     *
     * @param semanticsToRetain the set of words for which semantics should be
     *        computed.
     */
    public void setSemanticFilter(Set<String> semanticsToRetain) {
        semanticFilter.clear();
        semanticFilter.addAll(semanticsToRetain);
    }


    /**
     * Returns true if there is no semantic filter list or the word is in the
     * filter list.
     */
    private boolean acceptWord(String word) {
        return semanticFilter.isEmpty() || semanticFilter.contains(word);
    }


    /**
     * Atomically adds the values of the index vector to the semantic vector.
     * This is a special case addition operation that only iterates over the
     * non-zero values of the index vector.
     */
    private static void add(IntegerVector semantics, TernaryVector index) {
        // Lock on the semantic vector to avoid a race condition with another
        // thread updating its semantics.  Use the vector to avoid a class-level
        // lock, which would limit the concurrency.
        synchronized(semantics) {
            for (int p : index.positiveDimensions())
                semantics.add(p, 1);
            for (int n : index.negativeDimensions())
                semantics.add(n, -1);
        }
    }


     /**
     * Returns the current semantic vector for the provided word.  If the word
     * is not currently in the semantic space, a vector is added for it and
     * returned.
     *
     * @param word a word that requires a semantic vector
     *
     * @return the {@code SemanticVector} representing {@code word}
     */
    private IntegerVector getSemanticVector(String word) {
        IntegerVector v = wordSpace.get(word);
        if (v == null) {
            // lock on the word in case multiple threads attempt to add it at
            // once
            synchronized(this) {
                // recheck in case another thread added it while we were waiting
                // for the lock
                v = wordSpace.get(word);
                if (v == null) {
                    v = new CompactSparseIntegerVector(vectorLength);
                    wordSpace.put(word, v);
                }
            }
        }
        return v;
    }
}
Source Code of edu.ucla.sspace.dri.DependencyRandomIndexing

Related Classes of edu.ucla.sspace.dri.DependencyRandomIndexing