Source Code of edu.ucla.sspace.tri.OrderedTemporalRandomIndexing

/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.tri;


import edu.ucla.sspace.common.Filterable;


import edu.ucla.sspace.ri.RandomIndexing;


import edu.ucla.sspace.temporal.TemporalSemanticSpace;


import edu.ucla.sspace.vector.TernaryVector;
import edu.ucla.sspace.vector.Vector;


import java.io.BufferedReader;
import java.io.IOException;


import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.SortedSet;


import java.util.logging.Logger;




/**
 * A simplified version of {@link TemporalRandomIndexing} that imposes
 * restrictions on the document input ordering to improve efficiency at the cost
 * of functionality.  Specifically, this class assumes: <ol>
 *
 *  <li> Documents will be processed in an on-line manner such that all
 *  documents that comprise a semantic slice will be contiguous.
 *
 *  <li> After a semantic slice has been built and processed, it does not need
 *  to be referenced any longer and may be discarded.
 *
 * </ol>
 *
 * The first property requires that the initial data be sorted according to some
 * predetermined ordering.  The second property limits the semantics that are
 * retained at any given time period. <p>
 *
 * Because each slice is calculated and then discarded, this class provides a
 * way for users to be notified when a semantic slice has been completed.  Users
 * may add a {@link Runnable} via the {@link #addPartitionHook(Runnable)} method.
 * When the input stream of documents partitions the current semantic slice from
 * the next (i.e. the slice is complete), each runnable will be invoked.  This
 * allows users to perform any operations on the slice as necessary, such as
 * save it to disk or compute various statistics.<p>
 *
 * This class implements {@link Filterable}, which allows for fine-grained
 * control of which semantics are retained.  The {@link #setSemanticFilter(Set)}
 * method can be used to specify which words should have their semantics
 * retained.  Note that the words that are filtered out will still be used in
 * computing the semantics of <i>other</i> words.  This behavior is intended for
 * use with large corpora where retaining the semantics of all words in memory
 * is infeasible.<p>
 *
 * This base class defines the following configurable properties:
 *
 * <dl style="margin-left: 1em">
 *
 * <dt> <i>Property:</i> <code><b>{@value #WINDOW_SIZE_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@value #DEFAULT_WINDOW_SIZE}
 *
 * <dd style="padding-top: .5em">This variable sets the number of words before
 *      and after that are counted as co-occurring.  With the default value,
 *      {@code 4} words are counted before and {@code 4} words are counted
 *      after.  This class always uses a symmetric window. <p>
 *
 * <dt> <i>Property:</i> <code><b>{@value #VECTOR_LENGTH_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@value #DEFAULT_VECTOR_LENGTH}
 *
 * <dd style="padding-top: .5em">This variable sets the number of dimensions to
 *      be used for the index and semantic vectors. <p>
 *
 * <dt> <i>Property:</i> <code><b>{@value #USE_SPARSE_SEMANTICS_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@code true} 
 *
 * <dd style="padding-top: .5em">This property specifies whether to use a sparse
 *       encoding for each word's semantics.  Using a sparse encoding can result
 *       in a large saving in memory, while requiring more time to process each
 *       document.<p>
 *
 * </dl> <p>
 *
 * Due to the ordered nature of its processing, great care must be used when
 * invoking {@code processDocument} from multiple threads.  Multiple threads may
 * order the documents such that the time stamps at semantic slice boundaries
 * overlap.  This may cause the {@link #shouldPartitionSpace(long)} method to
 * return true for slices with only a single document.  Subclasses must make it
 * clear whether any such multithreading behavior is permissible and how to
 * correctly invoke it to avoid triggering semantic slice boundary edge cases.<p>
 *
 * In its base behavior, instances of this class do <i>not</i> support the
 * optional {@code getTimeSteps}, {@code getVectorAfter}, {@code
 * getVectorBefore} and {@code getVectorBetween} methods.  However, subclasses
 * may add this functionality.<p>
 *
 * @see RandomIndexing
 * @see TemporalRandomIndexing
 * @see TemporalSemanticSpace
 *
 * @author David Jurgens
 */
public abstract class OrderedTemporalRandomIndexing 
        implements TemporalSemanticSpace, Filterable {


    /**
     * The prefix for naming public properties.
     */
    private static final String PROPERTY_PREFIX = 
    "edu.ucla.sspace.tri.OrderedTemporalRandomIndexing";


    /**
     * The property to specify the fully qualified named of a {@link
     * edu.ucla.sspace.ri.PermutationFunction} if using permutations is enabled.
     */
    public static final String PERMUTATION_FUNCTION_PROPERTY = 
    PROPERTY_PREFIX + ".permutationFunction";


    /**
     * The property to specify whether the index vectors for co-occurrent words
     * should be permuted based on their relative position.
     */
    public static final String USE_PERMUTATIONS_PROPERTY = 
    PROPERTY_PREFIX + ".usePermutations";


    /**
     * Specifies whether to use a sparse encoding for each word's semantics,
     * which saves space but requires more computation.
     */
    public static final String USE_SPARSE_SEMANTICS_PROPERTY = 
    PROPERTY_PREFIX + ".sparseSemantics";


    /**
     * The property to specify the number of dimensions to be used by the index
     * and semantic vectors.
     */
    public static final String VECTOR_LENGTH_PROPERTY = 
    PROPERTY_PREFIX + ".vectorLength";


    /**
     * The property to specify the number of words to view before and after each
     * word in focus.
     */
    public static final String WINDOW_SIZE_PROPERTY = 
    PROPERTY_PREFIX + ".windowSize";


    /**
     * The default number of dimensions to be used by the index and semantic
     * vectors.
     */
    public static final int DEFAULT_VECTOR_LENGTH = 10000;


    /**
     * The default number of words to view before and after each word in focus.
     */
    public static final int DEFAULT_WINDOW_SIZE = 4; // +4/-4


    /**
     * The logger used for instances of this class
     */
    private static final Logger LOGGER =
    Logger.getLogger(OrderedTemporalRandomIndexing.class.getName());
    
    /**
     * The collection of hooks that are to be run prior to every time this
     * instances partitions its semantic space.
     */
    protected final Collection<Runnable> partitionHooks;


    /**
     * The current semantic slice, which is updated as new documents are
     * processed and has its semantics cleared when {@link
     * #shouldPartitionSpace(long)} returns {@code true}.
     */
    protected final RandomIndexing currentSlice;


    /**
     * The most recent time stamp seen during the current semantic slice 
     */
    protected Long endTime;


    /**
     * The least recent time stamp seen during the current semantic slice
     */
    protected Long startTime;
    
    /**
     * Creates an instance of {@code OrderedTemporalRandomIndexing} using
     * the system properties to configure the behavior.
     */
    public OrderedTemporalRandomIndexing() {
        this(System.getProperties());
    }


    /**
     * Creates an instance of {@code OrderedTemporalRandomIndexing} using
     * the system properties to configure the behavior.
     *
     * @param props the properties used to configure this instance
     */
    public OrderedTemporalRandomIndexing(Properties props) {


        partitionHooks = new ArrayList<Runnable>();
        
        // Translate the On-line TRI properties into RI properties
        Properties riProps = new Properties();


        // Conditionally assign any of the specified Ordered TRI properties to
        // the RI instance if they were set
        String prop = null;
        if ((prop = props.getProperty(VECTOR_LENGTH_PROPERTY)) != null)
            riProps.put(RandomIndexing.VECTOR_LENGTH_PROPERTY, prop);


        if ((prop = props.getProperty(WINDOW_SIZE_PROPERTY)) != null)
            riProps.put(RandomIndexing.WINDOW_SIZE_PROPERTY, prop);


        if ((prop = props.getProperty(USE_SPARSE_SEMANTICS_PROPERTY)) != null)
            riProps.put(RandomIndexing.USE_SPARSE_SEMANTICS_PROPERTY, prop);


        currentSlice = new RandomIndexing(riProps);
    }


    /**
     * Adds the provided {@code Runnable} to the list of hooks that will be
     * invoked immediately <i>prior</i> to the partitioning of this space.  This
     * method provides a mechanism for users to perform additional processing on
     * the current semantic slice of this space before it is discarded.
     *
     * @param hook a runnable to be invoked.
     */
    public void addPartitionHook(Runnable hook) {
        partitionHooks.add(hook);
    }


    /**
     * Clears the semantic content of this space as a part of the partitioning
     * processing.
     */
    protected void clear() {
        // Reset the current semantic slice
        currentSlice.clearSemantics();


        // Clear the start and end times, which will be reset after the next
        // document is processed following the clear() operation
        startTime = null;
        endTime = null;
    }


    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document) throws IOException {
        processDocument(document, System.currentTimeMillis());
    }


    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document, long timeStamp) 
        throws IOException {


        if (startTime != null && shouldPartitionSpace(timeStamp)) {
            for (Iterator<Runnable> it = partitionHooks.iterator(); 
                 it.hasNext(); ) {
                Runnable r = it.next();
                // If one of the hooks has errors, remove it from processing but
                // don't stop processing.
                try {
                    r.run();
                } catch (Throwable t) {
                    LOGGER.warning("Partition hook " + r + " caused the " + 
                           "following exception during its operations" +
                           t + " and is being removed");
                    it.remove();
                }
            }
            clear();
        }
        
        // Update the semantic slice ranges as necessary
        if (startTime == null) {
            startTime = timeStamp;
            endTime = timeStamp;
        }
        else if (endTime < timeStamp)
            timeStamp = endTime;


        currentSlice.processDocument(document);
    }


    /**
     * Sets a filter such that only words that are in the set have their
     * semantics retained by this instance.  Note that all words will still have
     * an index vector assigned to them, which is necessary to properly compute
     * the semantics.
     *
     * @param semanticsToRetain the set of words for which semantics should be
     *        computed.
     */
    public void setSemanticFilter(Set<String> semanticsToRetain) {
        currentSlice.setSemanticFilter(semanticsToRetain);
    }


    /**
     * Returns {@code true} if the current contents of this semantic space
     * should be partitioned and discarded <i>prior</i> to processing the next
     * document with the specified time stamp.  Subclasses should use this
     * method to specify the conditions under which the temporal semantics are
     * to be divided.
     *
     * @param nextTimeStamp the time stamp of the next document that has yet to
     *        be processed
     *
     * @return {@code true} if the current contents of this space should be
     *         partitioned and discarded before processing the next document
     */
    protected abstract boolean shouldPartitionSpace(long nextTimeStamp);


    /**
     * {@inheritDoc}
     */
    public Long startTime() {
        return startTime;
    }


    /**
     * {@inheritDoc}
     */
    public Long endTime() {
        return endTime;
    }


    /**
     * {@inheritDoc}
     */ 
    public abstract String getSpaceName();


    /**
     * <i>Not supported</i>
     *
     * @param word {@inheritDoc}
     *
     * @throws UnsupportedOperationException if called
     */ 
    public SortedSet<Long> getTimeSteps(String word) {
        throw new UnsupportedOperationException(
            "getTimeSteps is not supported");
    }
    
    /**
     * <i>Not supported</i>
     *
     * @param word {@inheritDoc}
     * @param startTime {@inheritDoc}
     *
     * @throws UnsupportedOperationException if called
     */ 
    public Vector getVectorAfter(String word, long startTime) {
        throw new UnsupportedOperationException(
            "getVectorAfter is not supported");
    }


    /**
     * <i>Not supported</i>
     *
     * @param word {@inheritDoc}
     * @param endTime {@inheritDoc}
     *
     * @throws UnsupportedOperationException if called
     */ 
    public Vector getVectorBefore(String word, long endTime) {
        throw new UnsupportedOperationException(
            "getVectorBefore is not supported");
    }


    /**
     * <i>Not supported</i>
     *
     * @param word {@inheritDoc}
     * @param startTime {@inheritDoc}
     * @param endTime {@inheritDoc}
     *
     * @throws UnsupportedOperationException if called
     */ 
    public Vector getVectorBetween(String word, long startTime, 
                     long endTime) {
        throw new UnsupportedOperationException(
            "getVectorBetween is not supported");
    }


    /**
     * {@inheritDoc}
     */
    public Vector getVector(String word) {
        return currentSlice.getVector(word);
    }


    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return currentSlice.getVectorLength();
    }




    /**
     * {@inheritDoc} Note that this set only includes the words that are present
     * in the current semantic slice, which may be a subset of the all the words
     * seen in all semantic slices.
     */
    public Set<String> getWords() {
        return currentSlice.getWords();
    }


    /**
     * Returns an unmodifiable view on the token to {@link TernaryVector}
     * mapping used by this instance.  Any further changes made by this instance
     * to its token to {@code TernaryVector} mapping will be reflected in the
     * return map.
     *
     * @return a mapping from the current set of tokens to the index vector used
     *         to represent them
     */
    public Map<String,TernaryVector> getWordToIndexVector() {
        return currentSlice.getWordToIndexVector();
    }


    /**
     * Does nothing.
     *
     * @param props {@inheritDoc}
     */
    public void processSpace(Properties props) { }


    /**
     * Assigns the token to {@link TernaryVector} mapping to be used by this
     * instance.  The contents of the map are copied, so any additions of new
     * index words by this instance will not be reflected in the parameter's
     * mapping.
     *
     * @param m a mapping from token to the {@code TernaryVector} that should be
     *        used represent it when calculating other word's semantics
     */
    public void setWordToIndexVector(Map<String,TernaryVector> m) {
        currentSlice.setWordToIndexVector(m);
    }


}
Source Code of edu.ucla.sspace.tri.OrderedTemporalRandomIndexing

Related Classes of edu.ucla.sspace.tri.OrderedTemporalRandomIndexing