Package edu.ucla.sspace.gws

Source Code of edu.ucla.sspace.gws.GenericWordSpace

/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.gws;

import edu.ucla.sspace.basis.BasisMapping;

import edu.ucla.sspace.common.Filterable;
import edu.ucla.sspace.common.DimensionallyInterpretableSemanticSpace;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.util.Duple;

import edu.ucla.sspace.vector.CompactSparseIntegerVector;
import edu.ucla.sspace.vector.IntegerVector;
import edu.ucla.sspace.vector.SparseIntegerVector;
import edu.ucla.sspace.vector.TernaryVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;

import java.util.concurrent.ConcurrentHashMap;


/**
* The most basic co-occurrence model that counts word co-occurrence within a
* sliding window with no further processing.  This class is meant as a generic
* model that can be used to measure the efficacy of other {@link SemanticSpace}
* instances.
*
* <p> This class also provides for a slight variation on the basic model by
* differentiating co-occurrences on the basis of their relative position to the
* focus word.  In such a case, for example, an occurrence of "red" two before
* the focus word would be represented by a different position than "red" one
* position before.  This is reminiscent of the {@link
* edu.ucla.sspace.ri.RandomIndexing RandomIndexing} model with permutations.
* However, unlike Random Indexing, this model is not fixed in the number of
* dimensions it may use, with a possible {@code numWords * windowSize * 2}
* dimensions.  Such a large number of dimensions can negatively impact the
* further operations on the semantic space's vectors, e.g., finding the most
* similar vectors for a word.
*
* <p> The dimensions of this space are annotated with a description of what
* they represent.  In the basic model, this will be the co-occurring word.  In
* the model that takes into account word order, the description will include
* the relative position of the word.
*
* <p> This class defines the following configurable properties that may be set
* using either the System properties or using the {@link
* GenericWordSpace#GenericWordSpace(Properties)} constructor.
*
* <dl style="margin-left: 1em">
*
* <dt> <i>Property:</i> <code><b>{@value #WINDOW_SIZE_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> {@value #DEFAULT_WINDOW_SIZE}
*
* <dd style="padding-top: .5em">This property sets the number of words before
*      and after that are counted as co-occurring.  With the default value,
*      {@value #DEFAULT_WINDOW_SIZE} words are counted before and {@value
*      #DEFAULT_WINDOW_SIZE} words are counted after.  This class always uses a
*      symmetric window. <p>
*
* <dt> <i>Property:</i> <code><b>{@value #USE_WORD_ORDER_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> {@code false}
*
* <dd style="padding-top: .5em">This property sets whether co-occurrences of
*      the same word should be distinguished on the basis of their relative
*      position to the focus word. <p>
*
* </dl>
*
* <p> This class implements {@link Filterable}, which allows for fine-grained
* control of which semantics are retained.  The {@link #setSemanticFilter(Set)}
* method can be used to specify which words should have their semantics
* retained.  Note that the words that are filtered out will still be used in
* computing the semantics of <i>other</i> words.  This behavior is intended for
* use with large corpora where retaining the semantics of all words in memory
* is infeasible.<p>
*
* This class is thread-safe for concurrent calls of {@link
* #processDocument(BufferedReader) processDocument}.  At any given point in
* processing, the {@link #getVector(String) getVector} method may be used
* to access the current semantics of a word.  This allows callers to track
* incremental changes to the semantics as the corpus is processed.  <p>
*
* The {@link #processSpace(Properties) processSpace} method does nothing for
* this class and calls to it will not affect the results of {@code
* getVector}.
*
* @author David Jurgens
*/
public class GenericWordSpace
        implements DimensionallyInterpretableSemanticSpace<String>, Filterable,
                   Serializable {

    private static final long serialVersionUID = 1L;

    public static final String GWS_SSPACE_NAME =
        "generic-word-space";

    /**
     * The prefix for naming public properties.
     */
    private static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.gws.GenericWordSpace";

    /**
     * The property to specify the number of words to view before and after each
     * word in focus.
     */
    public static final String WINDOW_SIZE_PROPERTY =
        PROPERTY_PREFIX + ".windowSize";

    /**
     * The property to specify whether the relative positions of a word's
     * co-occurrence should be use distinguished from each other.
     */
    public static final String USE_WORD_ORDER_PROPERTY =
        PROPERTY_PREFIX + ".useWordOrder";

    /**
     * The default number of words to view before and after each word in focus.
     */
    public static final int DEFAULT_WINDOW_SIZE = 2; // +2/-2
   
    /**
     * A mapping from each word to the vector the represents its semantics
     */
    private final Map<String,SparseIntegerVector> wordToSemantics;

    /**
     * The number of words to view before and after each focus word in a window.
     */
    private final int windowSize;

    /**
     * An optional set of words that restricts the set of semantic vectors that
     * this instance will retain.
     */
    private final Set<String> semanticFilter;

    /**
     * A mapping from a word an position to a specific dimension.  Note that if
     * word ordering is not being used, the dimension information is expected to
     * do nothing.
     */
    private final BasisMapping<Duple<String,Integer>,String> basisMapping;

    /**
     * Creates a new {@code GenericWordSpace} instance using the current {@code
     * System} properties for configuration.
     */
    public GenericWordSpace() {
        this(System.getProperties());
    }

    /**
     * Creates a new {@code GenericWordSpace} instance using the provided
     * properites for configuration.
     */
   public GenericWordSpace(Properties properties) {

        String windowSizeProp = properties.getProperty(WINDOW_SIZE_PROPERTY);
        windowSize = (windowSizeProp != null)
            ? Integer.parseInt(windowSizeProp)
            : DEFAULT_WINDOW_SIZE;

        String useWordOrderProp =
            properties.getProperty(USE_WORD_ORDER_PROPERTY);
        boolean useWordOrder = (useWordOrderProp != null)
            ? Boolean.parseBoolean(useWordOrderProp)
            : false;

        basisMapping = (useWordOrder)
            ? new WordOrderBasisMapping()
            : new WordBasisMapping();       

        wordToSemantics = new HashMap<String,SparseIntegerVector>(1024, 4f);
        semanticFilter = new HashSet<String>();
    }

    /**
     * Creates a new {@code GenericWordSpace} with the provided window size that
     * ignores word order.
     */
   public GenericWordSpace(int windowSize) {
       this(windowSize, new WordBasisMapping());
    }

    /**
     * Creates a new {@code GenericWordSpace} with the provided window size that
     * optionally includes word order.
     */
   public GenericWordSpace(int windowSize, boolean useWordOrder) {
       this(windowSize, (useWordOrder)
            ? new WordOrderBasisMapping()
            : new WordBasisMapping());
   }

    /**
     * Creates a new {@code GenericWordSpace} with the provided window size that
     * uses the specified basis mapping to map each co-occurrence at a specified
     * position to a dimension.
     *
     * @param basis a basis mapping from a duple that represents a word and its
     *        relative position to a dimension.
     */
   public GenericWordSpace(int windowSize,
                           BasisMapping<Duple<String,Integer>,String> basis) {
       this.windowSize = windowSize;
       this.basisMapping = basis;
       wordToSemantics = new HashMap<String,SparseIntegerVector>(1024, 4f);
       semanticFilter = new HashSet<String>();
   }

    /**
     * Removes all associations between word and semantics while still retaining
     * the words' basis mapping.  This method can be used to re-use the same
     * instance of a {@code GenericWordSpace} on multiple corpora while keeping
     * the semantics of the dimensions identical.
     */
    public void clearSemantics() {
        wordToSemantics.clear();
    }

    /**
     * Returns the current semantic vector for the provided word, or if the word
     * is not currently in the semantic space, a vector is added for it and
     * returned.
     *
     * @param word a word
     *
     * @return the {@code SemanticVector} for the provide word.
     */
    private SparseIntegerVector getSemanticVector(String word) {
        SparseIntegerVector v = wordToSemantics.get(word);
        if (v == null) {
            // lock on the word in case multiple threads attempt to add it at
            // once
            synchronized(this) {
                // recheck in case another thread added it while we were waiting
                // for the lock
                v = wordToSemantics.get(word);
                if (v == null) {
                    v = new CompactSparseIntegerVector(Integer.MAX_VALUE);
                    wordToSemantics.put(word, v);
                }
            }
        }
        return v;
    }

    /**
     * {@inheritDoc}
     */
    public String getDimensionDescription(int dimension) {
        return basisMapping.getDimensionDescription(dimension);
    }

   /**
     * {@inheritDoc} Note that because the word space is potentially growing as
     * new documents are processed, the length of the returned vector is equal
     * to the number of dimensions <i>at the time of this call</i> and therefore
     * may be less that the number of dimensions for the same word when obtained
     * at a later time.
     */
    public SparseIntegerVector getVector(String word) {
        SparseIntegerVector v = wordToSemantics.get(word);
        if (v == null) {
            return null;
        }
        // Note that because the word space is potentially ever growing, we wrap
        // the return vectors with the size of the semantic space at the time of
        // the call.
        return Vectors.immutable(Vectors.subview(v, 0, getVectorLength()));
    }

    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return GWS_SSPACE_NAME + "-w-" + windowSize
            + "-" + basisMapping;
    }

    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return basisMapping.numDimensions();
    }

    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        return Collections.unmodifiableSet(wordToSemantics.keySet());
    }
   
    /**
     * Updates the semantic vectors based on the words in the document.
     *
     * @param document {@inheritDoc}
     */
    public void processDocument(BufferedReader document) throws IOException {
        Queue<String> prevWords = new ArrayDeque<String>(windowSize);
        Queue<String> nextWords = new ArrayDeque<String>(windowSize);

        Iterator<String> documentTokens =
            IteratorFactory.tokenizeOrdered(document);

        String focusWord = null;

        // prefetch the first windowSize words
        for (int i = 0; i < windowSize && documentTokens.hasNext(); ++i)
            nextWords.offer(documentTokens.next());
       
        while (!nextWords.isEmpty()) {
            focusWord = nextWords.remove();

            // shift over the window to the next word
            if (documentTokens.hasNext()) {
                String windowEdge = documentTokens.next();
                nextWords.offer(windowEdge);
            }   

            // If we are filtering the semantic vectors, check whether this word
            // should have its semantics calculated.  In addition, if there is a
            // filter and it would have excluded the word, do not keep its
            // semantics around
            boolean calculateSemantics =
                (semanticFilter.isEmpty() || semanticFilter.contains(focusWord))
                && !focusWord.equals(IteratorFactory.EMPTY_TOKEN);
           
            if (calculateSemantics) {
                SparseIntegerVector focusSemantics =
                    getSemanticVector(focusWord);

                // Keep track of the relative position of the focus word in case
                // word ordering is being used.
                int position = -prevWords.size(); // first word is furthest
                for (String word : prevWords) {
                    // Skip the addition of any words that are excluded from the
                    // filter set.  Note that by doing the exclusion here, we
                    // ensure that the token stream maintains its existing
                    // ordering, which is necessary when word order is taken
                    // into account.
                    if (word.equals(IteratorFactory.EMPTY_TOKEN)) {
                        position++;
                        continue;
                    }
                   
                    int dimension = basisMapping.getDimension(
                        new Duple<String,Integer>(word, position));
                    synchronized(focusSemantics) {
                        focusSemantics.add(dimension, 1);
                    }
                    position++;
                }
           
                // Repeat for the words in the forward window.
                position = 1;
                for (String word : nextWords) {
                    // Skip the addition of any words that are excluded from the
                    // filter set.  Note that by doing the exclusion here, we
                    // ensure that the token stream maintains its existing
                    // ordering, which is necessary when word order is taken
                    // into account.
                    if (word.equals(IteratorFactory.EMPTY_TOKEN)) {
                        ++position;
                        continue;
                    }
               
                    int dimension = basisMapping.getDimension(
                        new Duple<String,Integer>(word, position));
                    synchronized(focusSemantics) {
                        focusSemantics.add(dimension, 1);
                    }
                    position++;
                }
            }

            // Last put this focus word in the prev words and shift off the
            // front of the previous word window if it now contains more words
            // than the maximum window size
            prevWords.offer(focusWord);
            if (prevWords.size() > windowSize) {
                prevWords.remove();
            }
        }   

        document.close();
    }
   
    /**
     * Does nothing.
     *
     * @param properties {@inheritDoc}
     */
    public void processSpace(Properties properties) {
    }

    /**
     * {@inheritDoc} Note that all words will still have an index vector
     * assigned to them, which is necessary to properly compute the semantics.
     *
     * @param semanticsToRetain the set of words for which semantics should be
     *        computed.
     */
    public void setSemanticFilter(Set<String> semanticsToRetain) {
        semanticFilter.clear();
        semanticFilter.addAll(semanticsToRetain);
    }

}
TOP

Related Classes of edu.ucla.sspace.gws.GenericWordSpace

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.