Source Code of edu.ucla.sspace.gws.GenericWordSpace

/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.gws;


import edu.ucla.sspace.basis.BasisMapping;

import edu.ucla.sspace.common.DimensionallyInterpretableSemanticSpace;
import edu.ucla.sspace.common.Filterable;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.util.Duple;

import edu.ucla.sspace.vector.CompactSparseIntegerVector;
import edu.ucla.sspace.vector.IntegerVector;
import edu.ucla.sspace.vector.SparseIntegerVector;
import edu.ucla.sspace.vector.TernaryVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;

import java.util.concurrent.ConcurrentHashMap;




/**
 * The most basic co-occurrence model that counts word co-occurrence within a
 * sliding window with no further processing.  This class is meant as a generic
 * model that can be used to measure the efficacy of other {@link SemanticSpace}
 * instances.
 *
 * <p> This class also provides for a slight variation on the basic model by
 * differentiating co-occurrences on the basis of their relative position to the
 * focus word.  In such a case, for example, an occurrence of "red" two before
 * the focus word would be represented by a different position than "red" one
 * position before.  This is reminiscent of the {@link
 * edu.ucla.sspace.ri.RandomIndexing RandomIndexing} model with permutations.
 * However, unlike Random Indexing, this model is not fixed in the number of
 * dimensions it may use, with a possible {@code numWords * windowSize * 2}
 * dimensions.  Such a large number of dimensions can negatively impact the
 * further operations on the semantic space's vectors, e.g., finding the most
 * similar vectors for a word.
 *
 * <p> The dimensions of this space are annotated with a description of what
 * they represent.  In the basic model, this will be the co-occurring word.  In
 * the model that takes into account word order, the description will include
 * the relative position of the word.
 *
 * <p> This class defines the following configurable properties that may be set
 * using either the System properties or using the {@link
 * GenericWordSpace#GenericWordSpace(Properties)} constructor.
 *
 * <dl style="margin-left: 1em">
 *
 * <dt> <i>Property:</i> <code><b>{@value #WINDOW_SIZE_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@value #DEFAULT_WINDOW_SIZE}
 *
 * <dd style="padding-top: .5em">This property sets the number of words before
 *      and after that are counted as co-occurring.  With the default value,
 *      {@value #DEFAULT_WINDOW_SIZE} words are counted before and {@value
 *      #DEFAULT_WINDOW_SIZE} words are counter after.  This class always uses a
 *      symmetric window. <p>
 *
 * <dt> <i>Property:</i> <code><b>{@value #USE_WORD_ORDER_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@code false}
 *
 * <dd style="padding-top: .5em">This property sets whether co-occurrences of
 *      the same word should be distinguished on the basis of their relative
 *      position to the focus word. <p>
 *
 * </dl> 
 *
 * <p> This class implements {@link Filterable}, which allows for fine-grained
 * control of which semantics are retained.  The {@link #setSemanticFilter(Set)}
 * method can be used to speficy which words should have their semantics
 * retained.  Note that the words that are filtered out will still be used in
 * computing the semantics of <i>other</i> words.  This behavior is intended for
 * use with a large corpora where retaining the semantics of all words in memory
 * is infeasible.<p>
 *
 * This class is thread-safe for concurrent calls of {@link
 * #processDocument(BufferedReader) processDocument}.  At any given point in
  * processing, the {@link #getVector(String) getVector} method may be used
 * to access the current semantics of a word.  This allows callers to track
 * incremental changes to the semantics as the corpus is processed.  <p>
 *
 * The {@link #processSpace(Properties) processSpace} method does nothing for
 * this class and calls to it will not affect the results of {@code
 * getVector}.
 *
 * @author David Jurgens
 */
public class GenericWordSpace 
        implements DimensionallyInterpretableSemanticSpace<String>, Filterable, 
                   Serializable {


    private static final long serialVersionUID = 1L;


    public static final String GWS_SSPACE_NAME =
        "generic-word-space";


    /**
     * The prefix for naming public properties.
     */
    private static final String PROPERTY_PREFIX = 
        "edu.ucla.sspace.gws.GenericWordSpace";


    /**
     * The property to specify the number of words to view before and after each
     * word in focus.
     */
    public static final String WINDOW_SIZE_PROPERTY = 
        PROPERTY_PREFIX + ".windowSize";


    /**
     * The property to specify whether the relative positions of a word's
     * co-occurrence should be use distinguished from each other.
     */
    public static final String USE_WORD_ORDER_PROPERTY = 
        PROPERTY_PREFIX + ".useWordOrder";


    /**
     * The default number of words to view before and after each word in focus.
     */
    public static final int DEFAULT_WINDOW_SIZE = 2; // +2/-2
    
    /**
     * A mapping from each word to the vector the represents its semantics
     */
    private final Map<String,SparseIntegerVector> wordToSemantics;


    /**
     * The number of words to view before and after each focus word in a window.
     */
    private final int windowSize;


    /**
     * An optional set of words that restricts the set of semantic vectors that
     * this instance will retain.
     */
    private final Set<String> semanticFilter;


    /**
     * A mapping from a word an position to a specific dimension.  Note that if
     * word ordering is not being used, the dimension information is expected to
     * do nothing.
     */
    private final BasisMapping<Duple<String,Integer>,String> basisMapping;


    /**
     * Creates a new {@code GenericWordSpace} instance using the current {@code
     * System} properties for configuration.
     */
    public GenericWordSpace() {
        this(System.getProperties());
    }


    /**
     * Creates a new {@code GenericWordSpace} instance using the provided
     * properites for configuration.
     */
   public GenericWordSpace(Properties properties) {


        String windowSizeProp = properties.getProperty(WINDOW_SIZE_PROPERTY);
        windowSize = (windowSizeProp != null)
            ? Integer.parseInt(windowSizeProp)
            : DEFAULT_WINDOW_SIZE;


        String useWordOrderProp = 
            properties.getProperty(USE_WORD_ORDER_PROPERTY);
        boolean useWordOrder = (useWordOrderProp != null)
            ? Boolean.parseBoolean(useWordOrderProp)
            : false;


        basisMapping = (useWordOrder)
            ? new WordOrderBasisMapping()
            : new WordBasisMapping();        


        wordToSemantics = new HashMap<String,SparseIntegerVector>(1024, 4f);
        semanticFilter = new HashSet<String>();
    }


    /**
     * Creates a new {@code GenericWordSpace} with the provided window size that
     * ignores word order.
     */
   public GenericWordSpace(int windowSize) {
       this(windowSize, new WordBasisMapping());
    }


    /**
     * Creates a new {@code GenericWordSpace} with the provided window size that
     * optionally includes word order.
     */
   public GenericWordSpace(int windowSize, boolean useWordOrder) {
       this(windowSize, (useWordOrder)
            ? new WordOrderBasisMapping()
            : new WordBasisMapping());
   }


    /**
     * Creates a new {@code GenericWordSpace} with the provided window size that
     * uses the specified basis mapping to map each co-occurrence at a specified
     * position to a dimension.
     *
     * @param basis a basis mapping from a duple that represents a word and its
     *        relative position to a dimension.
     */
   public GenericWordSpace(int windowSize, 
                           BasisMapping<Duple<String,Integer>,String> basis) {
       this.windowSize = windowSize;
       this.basisMapping = basis;
       wordToSemantics = new HashMap<String,SparseIntegerVector>(1024, 4f);
       semanticFilter = new HashSet<String>();
   }


    /**
     * Removes all associations between word and semantics while still retaining
     * the words' basis mapping.  This method can be used to re-use the same
     * instance of a {@code GenericWordSpace} on multiple corpora while keeping
     * the semantics of the dimensions identical.
     */
    public void clearSemantics() {
        wordToSemantics.clear();
    }


    /**
     * Returns the current semantic vector for the provided word, or if the word
     * is not currently in the semantic space, a vector is added for it and
     * returned.
     *
     * @param word a word
     *
     * @return the {@code SemanticVector} for the provide word.
     */
    private SparseIntegerVector getSemanticVector(String word) {
        SparseIntegerVector v = wordToSemantics.get(word);
        if (v == null) {
            // lock on the word in case multiple threads attempt to add it at
            // once
            synchronized(this) {
                // recheck in case another thread added it while we were waiting
                // for the lock
                v = wordToSemantics.get(word);
                if (v == null) {
                    v = new CompactSparseIntegerVector(Integer.MAX_VALUE);
                    wordToSemantics.put(word, v);
                }
            }
        }
        return v;
    }


    /**
     * {@inheritDoc}
     */
    public String getDimensionDescription(int dimension) {
        return basisMapping.getDimensionDescription(dimension);
    }


   /**
     * {@inheritDoc} Note that because the word space is potentially growing as
     * new documents are processed, the length of the returned vector is equal
     * to the number of dimensions <i>at the time of this call</i> and therefore
     * may be less that the number of dimensions for the same word when obtained
     * at a later time.
     */ 
    public SparseIntegerVector getVector(String word) {
        SparseIntegerVector v = wordToSemantics.get(word);
        if (v == null) {
            return null;
        }
        // Note that because the word space is potentially ever growing, we wrap
        // the return vectors with the size of the semantic space at the time of
        // the call.
        return Vectors.immutable(Vectors.subview(v, 0, getVectorLength()));
    }


    /**
     * {@inheritDoc}
     */ 
    public String getSpaceName() {
        return GWS_SSPACE_NAME + "-w-" + windowSize
            + "-" + basisMapping;
    }


    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return basisMapping.numDimensions();
    }


    /**
     * {@inheritDoc}
     */ 
    public Set<String> getWords() {
        return Collections.unmodifiableSet(wordToSemantics.keySet());
    }
    
    /**
     * Updates the semantic vectors based on the words in the document.
     *
     * @param document {@inheritDoc}
     */
    public void processDocument(BufferedReader document) throws IOException {
        Queue<String> prevWords = new ArrayDeque<String>(windowSize);
        Queue<String> nextWords = new ArrayDeque<String>(windowSize);


        Iterator<String> documentTokens = 
            IteratorFactory.tokenizeOrdered(document);


        String focusWord = null;


        // prefetch the first windowSize words 
        for (int i = 0; i < windowSize && documentTokens.hasNext(); ++i)
            nextWords.offer(documentTokens.next());
        
        while (!nextWords.isEmpty()) {
            focusWord = nextWords.remove();


            // shift over the window to the next word
            if (documentTokens.hasNext()) {
                String windowEdge = documentTokens.next(); 
                nextWords.offer(windowEdge);
            }    


            // If we are filtering the semantic vectors, check whether this word
            // should have its semantics calculated.  In addition, if there is a
            // filter and it would have excluded the word, do not keep its
            // semantics around
            boolean calculateSemantics =
                (semanticFilter.isEmpty() || semanticFilter.contains(focusWord))
                && !focusWord.equals(IteratorFactory.EMPTY_TOKEN);
            
            if (calculateSemantics) {
                SparseIntegerVector focusSemantics = 
                    getSemanticVector(focusWord);


                // Keep track of the relative position of the focus word in case
                // word ordering is being used.
                int position = -prevWords.size(); // first word is furthest
                for (String word : prevWords) {
                    // Skip the addition of any words that are excluded from the
                    // filter set.  Note that by doing the exclusion here, we
                    // ensure that the token stream maintains its existing
                    // ordering, which is necessary when word order is taken
                    // into account.
                    if (word.equals(IteratorFactory.EMPTY_TOKEN)) {
                        position++;
                        continue;
                    }
                    
                    int dimension = basisMapping.getDimension(
                        new Duple<String,Integer>(word, position));
                    synchronized(focusSemantics) {
                        focusSemantics.add(dimension, 1);
                    }
                    position++;
                }
            
                // Repeat for the words in the forward window.
                position = 1;
                for (String word : nextWords) {
                    // Skip the addition of any words that are excluded from the
                    // filter set.  Note that by doing the exclusion here, we
                    // ensure that the token stream maintains its existing
                    // ordering, which is necessary when word order is taken
                    // into account.
                    if (word.equals(IteratorFactory.EMPTY_TOKEN)) {
                        ++position;
                        continue;
                    }
                
                    int dimension = basisMapping.getDimension(
                        new Duple<String,Integer>(word, position));
                    synchronized(focusSemantics) {
                        focusSemantics.add(dimension, 1);
                    }
                    position++;
                }
            }


            // Last put this focus word in the prev words and shift off the
            // front of the previous word window if it now contains more words
            // than the maximum window size
            prevWords.offer(focusWord);
            if (prevWords.size() > windowSize) {
                prevWords.remove();
            }
        }    


        document.close();
    }
    
    /**
     * Does nothing.
     *
     * @param properties {@inheritDoc}
     */
    public void processSpace(Properties properties) {
    }


    /**
     * {@inheritDoc} Note that all words will still have an index vector
     * assigned to them, which is necessary to properly compute the semantics.
     *
     * @param semanticsToRetain the set of words for which semantics should be
     *        computed.
     */
    public void setSemanticFilter(Set<String> semanticsToRetain) {
        semanticFilter.clear();
        semanticFilter.addAll(semanticsToRetain);
    }


}
Source Code of edu.ucla.sspace.gws.GenericWordSpace

Related Classes of edu.ucla.sspace.gws.GenericWordSpace