/*
* Copyright 2009 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.coals;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.matrix.CellMaskedSparseMatrix;
import edu.ucla.sspace.matrix.ArrayMatrix;
import edu.ucla.sspace.matrix.AtomicGrowingSparseMatrix;
import edu.ucla.sspace.matrix.MatlabSparseMatrixBuilder;
import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.MatrixBuilder;
import edu.ucla.sspace.matrix.MatrixFactorization;
import edu.ucla.sspace.matrix.MatrixFile;
import edu.ucla.sspace.matrix.MatrixIO;
import edu.ucla.sspace.matrix.MatrixIO.Format;
import edu.ucla.sspace.matrix.Normalize;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.Transform;
import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.SparseHashDoubleVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;
import edu.ucla.sspace.vector.VectorMath;
import edu.ucla.sspace.text.IteratorFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
/**
* An implementation of the COALS Semantic Space model. This implementation is
* based on:
*
* <p style="font-family:Garamond, Georgia, serif"> Rohde, D. L. T.,
* Gonnerman, L. M., Plaut, D. C. (2005). An Improved Model of Semantic
* Similarity Based on Lexical Co-Occurrence. <i>Cognitive Science</i>
* <b>(submitted)</b>. Available <a
* href="http://www.cnbc.cmu.edu/~plaut/papers/pdf/RohdeGonnermanPlautSUB-CogSci.COALS.pdf">here</a></p>
*
 * COALS first computes a term-by-term co-occurrence matrix using a ramped
 * 4-word window. Once all documents have been processed, the co-occurrence
 * matrix is reordered such that only the {@code N} most frequent terms have
 * their semantic vectors retained and only the {@code M} most frequent terms
 * are used as co-occurrence features. These values can be set by the {@value
 * #MAX_WORDS_PROPERTY} and {@value #MAX_DIMENSIONS_PROPERTY} properties,
 * respectively. After reordering the semantic vectors and features, {@link
 * CorrelationTransform} is used to rerank all co-occurrence scores. As part of
 * this transform, all negative correlations are dropped and replaced with a 0.
 * Finally, and optionally, the {@link SVD} is used to reduce the semantic
 * space. To set the number of retained dimensions via {@link SVD}, set the
 * {@value #REDUCE_DIMENSION_PROPERTY} property.
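 *
 * <p>A minimal usage sketch; the concrete {@link Transform} and {@link
 * MatrixFactorization} instances below are placeholders for whichever
 * implementations are available, and either may be {@code null} to skip the
 * corresponding step:
 *
 * <pre>{@code
 *   Transform transform = ...;          // e.g. a correlation transform
 *   MatrixFactorization reducer = ...;  // e.g. an SVD-based reducer, or null
 *   Coals coals = new Coals(transform, reducer);
 *   coals.processDocument(new BufferedReader(new FileReader("corpus.txt")));
 *   coals.processSpace(System.getProperties());
 *   Vector catVector = coals.getVector("cat");
 * }</pre>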
*
* @author Keith Stevens
*/
public class Coals implements SemanticSpace {
/**
* The property prefix for other settings.
*/
public static final String PROPERTY_PREFIX =
"edu.ucla.sspace.coals.Coals";
/**
 * Specifies whether or not the co-occurrence matrix should be reduced.
*/
public static final String REDUCE_MATRIX_PROPERTY =
PROPERTY_PREFIX + ".reduce";
/**
 * Specifies the number of dimensions the co-occurrence matrix should be
* reduced to.
*/
public static final String REDUCE_DIMENSION_PROPERTY =
PROPERTY_PREFIX + ".dimension";
/**
 * Specifies the number of dimensions in the raw co-occurrence matrix to
* maintain.
*/
public static final String MAX_DIMENSIONS_PROPERTY =
PROPERTY_PREFIX + ".maxDimensions";
/**
* Specifies the number of words to build semantics for.
*/
public static final String MAX_WORDS_PROPERTY =
PROPERTY_PREFIX + ".maxWords";
/**
 * Specifies that {@link Coals} should not normalize the co-occurrence matrix.
*/
public static final String DO_NOT_NORMALIZE_PROPERTY =
PROPERTY_PREFIX + ".doNotNormalize";
/**
* The default number of dimensions to reduce to.
*/
private static final int DEFAULT_REDUCE_DIMENSIONS = 800;
/**
 * The default number of dimensions to save in the co-occurrence matrix.
*/
private static final int DEFAULT_MAX_DIMENSIONS = 14000;
/**
 * The default number of rows to save in the co-occurrence matrix.
*/
private static final int DEFAULT_MAX_WORDS = 15000;
/**
* The name of this {@code SemanticSpace}
*/
public static final String COALS_SSPACE_NAME =
"coals-semantic-space";
/**
* The logger used to record all output
*/
private static final Logger COALS_LOGGER =
Logger.getLogger(Coals.class.getName());
/**
 * A mapping from each word to the vector that represents its semantics
*/
private Map<String, SparseDoubleVector> wordToSemantics;
/**
* A mapping from word to index number.
*/
private Map<String, Integer> termToIndex;
/**
 * A map containing the total frequency counts of each word.
*/
private ConcurrentMap<String, AtomicInteger> totalWordFreq;
/**
* The final reduced matrix.
*/
private Matrix finalCorrelation;
/**
* Specifies the number of reduced dimensions if the matrix is reduced by
* SVD.
*/
private final int reducedDimensions;
/**
* The maximum number of words that will be retained by {@link Coals}.
*/
private final int maxWords;
/**
* The maximum number of co-occurring words that will be retained by {@link
* Coals}.
*/
private final int maxDimensions;
/**
* A counter for keeping track of the index values of words.
*/
private int wordIndexCounter;
/**
     * The {@link MatrixFactorization} algorithm that will decompose the word by
     * co-occurrence feature space into two smaller feature spaces: a word by
     * class feature space and a class by co-occurrence feature space.
*/
private final MatrixFactorization reducer;
/**
     * The {@link Transform} applied to the co-occurrence counts, if not {@code
* null}.
*/
private final Transform transform;
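    /**
     * Creates a {@link Coals} instance that uses the default number of
     * reduced dimensions, retained words, and retained co-occurrence
     * dimensions.
     */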
public Coals(Transform transform, MatrixFactorization reducer) {
this(transform, reducer, DEFAULT_REDUCE_DIMENSIONS,
DEFAULT_MAX_WORDS, DEFAULT_MAX_DIMENSIONS);
}
/**
     * Creates a {@link Coals} instance.  A value of 0 for any of the numeric
     * arguments selects the corresponding default.
*/
public Coals(Transform transform,
MatrixFactorization reducer,
int reducedDimensions,
int maxWords,
int maxDimensions) {
termToIndex = new HashMap<String, Integer>();
totalWordFreq = new ConcurrentHashMap<String, AtomicInteger>();
wordToSemantics = new HashMap<String, SparseDoubleVector>(1024, 4f);
finalCorrelation = null;
this.transform = transform;
this.reducer = reducer;
this.reducedDimensions = (reducedDimensions == 0)
? DEFAULT_REDUCE_DIMENSIONS
: reducedDimensions;
this.maxWords = (maxWords == 0)
? DEFAULT_MAX_WORDS
: maxWords;
this.maxDimensions = (maxDimensions == 0)
? DEFAULT_MAX_DIMENSIONS
: maxDimensions;
}
/**
* {@inheritDoc}
*/
public Set<String> getWords() {
return termToIndex.keySet();
}
/**
* {@inheritDoc}
*/
public Vector getVector(String term) {
Integer index = termToIndex.get(term);
if (index == null)
return null;
return Vectors.immutable(
finalCorrelation.getRowVector(index.intValue()));
}
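    /**
     * {@inheritDoc}
     */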
public String getSpaceName() {
String ret = COALS_SSPACE_NAME;
if (reducer != null)
ret += "-svd-" + reducedDimensions;
return ret;
}
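    /**
     * {@inheritDoc}
     */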
public int getVectorLength() {
return finalCorrelation.columns();
}
/**
* {@inheritDoc}
*/
public void processDocument(BufferedReader document) throws IOException {
Map<String, Integer> wordFreq = new HashMap<String, Integer>();
Map<String, SparseDoubleVector> wordDocSemantics =
new HashMap<String, SparseDoubleVector>();
// Setup queues to track the set of previous and next words in a
// context.
Queue<String> prevWords = new ArrayDeque<String>();
Queue<String> nextWords = new ArrayDeque<String>();
Iterator<String> it = IteratorFactory.tokenizeOrdered(document);
for (int i = 0; i < 4 && it.hasNext(); ++i)
nextWords.offer(it.next());
        // Compute the co-occurrence statistics of each focus word in the
// document.
while (!nextWords.isEmpty()) {
// Slide over the context by one word.
if (it.hasNext())
nextWords.offer(it.next());
// Get the focus word
String focusWord = nextWords.remove();
if (!focusWord.equals(IteratorFactory.EMPTY_TOKEN)) {
getIndexFor(focusWord);
// Update the frequency count of the focus word.
Integer focusFreq = wordFreq.get(focusWord);
wordFreq.put(focusWord, (focusFreq == null)
? 1
: 1 + focusFreq.intValue());
                // Get the temporary semantics for the focus word, creating a
                // new vector for them if needed.
SparseDoubleVector focusSemantics = wordDocSemantics.get(
focusWord);
if (focusSemantics == null) {
focusSemantics = new SparseHashDoubleVector(
Integer.MAX_VALUE);
wordDocSemantics.put(focusWord, focusSemantics);
}
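                // Weight each co-occurring word by its proximity to the
                // focus word: the nearest word on either side contributes 4,
                // the next nearest 3, then 2, then 1 (the "ramped" window).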
// Process the previous words.
int offset = 4 - prevWords.size();
for (String word : prevWords) {
offset++;
if (word.equals(IteratorFactory.EMPTY_TOKEN))
continue;
int index = getIndexFor(word);
focusSemantics.add(index, offset);
}
// Process the next words.
offset = 5;
for (String word : nextWords) {
offset--;
if (word.equals(IteratorFactory.EMPTY_TOKEN))
continue;
int index = getIndexFor(word);
focusSemantics.add(index, offset);
}
}
prevWords.offer(focusWord);
if (prevWords.size() > 4)
prevWords.remove();
}
        // Add the temporary vectors for each word in this document to the
        // actual semantic vectors.
for (Map.Entry<String, SparseDoubleVector> e :
wordDocSemantics.entrySet()) {
SparseDoubleVector focusSemantics = getSemanticVector(
e.getKey());
            // Get the non-zero indices beforehand so that they are cached
// during the synchronized section.
focusSemantics.getNonZeroIndices();
synchronized (focusSemantics) {
VectorMath.add(focusSemantics, e.getValue());
}
}
// Store the total frequency counts of the words seen in this document
// so far.
for (Map.Entry<String, Integer> entry : wordFreq.entrySet()) {
int count = entry.getValue().intValue();
AtomicInteger freq = totalWordFreq.putIfAbsent(
entry.getKey(), new AtomicInteger(count));
if (freq != null)
freq.addAndGet(count);
}
}
/**
     * Returns the current semantic vector for the provided word.  If the word
     * is not currently in the semantic space, a new vector is added for it
     * and returned.
*
* @param word a word
*
     * @return the {@code SparseDoubleVector} for the provided word.
*/
private SparseDoubleVector getSemanticVector(String word) {
SparseDoubleVector v = wordToSemantics.get(word);
if (v == null) {
// lock on the word in case multiple threads attempt to add it at
// once
synchronized(this) {
// recheck in case another thread added it while we were waiting
// for the lock
v = wordToSemantics.get(word);
if (v == null) {
v = new CompactSparseVector();
wordToSemantics.put(word, v);
}
}
}
return v;
}
/**
     * Returns the index in the co-occurrence matrix for this word. If the word
* was not previously assigned an index, this method adds one for it and
* returns that index.
*/
private int getIndexFor(String word) {
Integer index = termToIndex.get(word);
if (index == null) {
synchronized(this) {
// recheck to see if the term was added while blocking
index = termToIndex.get(word);
// if another thread has not already added this word while the
// current thread was blocking waiting on the lock, then add it.
if (index == null) {
int i = wordIndexCounter++;
termToIndex.put(word, i);
return i; // avoid the auto-boxing to assign i to index
}
}
}
return index;
}
/**
* {@inheritDoc}
*/
public void processSpace(Properties props) {
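        // The space is finalized in three steps: (1) restrict the raw counts
        // to the most frequent rows and columns, (2) optionally apply the
        // transform, and (3) optionally reduce the result with the provided
        // MatrixFactorization.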
COALS_LOGGER.info("Droppring dimensions from co-occurrance matrix.");
// Read in the matrix from a file with dimensions dropped.
finalCorrelation = buildMatrix(maxWords, maxDimensions);
COALS_LOGGER.info("Done dropping dimensions.");
if (transform != null) {
COALS_LOGGER.info("Normalizing co-occurrance matrix.");
// Normalize the matrix using correlation.
int wordCount = finalCorrelation.rows();
finalCorrelation = transform.transform(finalCorrelation);
COALS_LOGGER.info("Done normalizing co-occurrance matrix.");
}
if (reducer != null) {
if (reducedDimensions > finalCorrelation.columns())
throw new IllegalArgumentException(
"Cannot reduce to more dimensions than exist");
COALS_LOGGER.info("Reducing using SVD.");
try {
File coalsMatrixFile =
File.createTempFile("coals-term-doc-matrix", "dat");
coalsMatrixFile.deleteOnExit();
MatrixIO.writeMatrix(finalCorrelation,
coalsMatrixFile,
Format.SVDLIBC_SPARSE_BINARY);
MatrixFile processedSpace = new MatrixFile(
coalsMatrixFile, Format.SVDLIBC_SPARSE_BINARY);
reducer.factorize(processedSpace, reducedDimensions);
finalCorrelation = reducer.dataClasses();
} catch (IOException ioe) {
throw new IOError(ioe);
}
COALS_LOGGER.info("Done reducing using SVD.");
}
}
/**
     * Returns a {@link Matrix} that contains {@code maxWords} rows and {@code
     * maxDimensions} columns. If {@code maxWords} is 0, then all words will be
     * retained in the semantic {@link Matrix}. If {@code maxDimensions} is
     * larger than the number of observed features, then all observed features
     * will be maintained. The resulting rows and columns are both ordered
     * based on the frequency of each term, in descending order, and {@code
     * termToIndex} is modified to account for these changes.
*/
private Matrix buildMatrix(int maxWords, int maxDimensions) {
// Convert the vectors in the semantic map to a matrix.
SparseDoubleVector[] vectorList =
new SparseDoubleVector[wordToSemantics.size()];
for (Map.Entry<String, SparseDoubleVector> e :
wordToSemantics.entrySet())
vectorList[getIndexFor(e.getKey())] = e.getValue();
SparseMatrix matrix = Matrices.asSparseMatrix(
Arrays.asList(vectorList));
        // If maxWords is 0 or exceeds the vocabulary size, save all words.
if (maxWords == 0 || maxWords > wordToSemantics.size())
maxWords = wordToSemantics.size();
COALS_LOGGER.info("Forming the inverse mapping from terms to indices.");
// Calculate an inverse mapping from index to word since the binary file
// stores things by index number.
String[] indexToTerm = new String[termToIndex.size()];
for (Map.Entry<String, Integer> entry : termToIndex.entrySet())
indexToTerm[entry.getValue()] = entry.getKey();
COALS_LOGGER.info("Sorting the terms based on frequency.");
// Calculate the new indices for each word that will be kept based on
// the frequency count, where the most frequent word will be first.
ArrayList<Map.Entry<String, AtomicInteger>> wordCountList =
new ArrayList<Map.Entry<String, AtomicInteger>>(
totalWordFreq.entrySet());
Collections.sort(wordCountList, new EntryComp());
// Calculate the new term to index mapping based on the order of the
// word frequencies.
COALS_LOGGER.info("Generating the index masks.");
// Compute the number of dimensions to maintain.
int wordCount = (wordCountList.size() > maxDimensions)
? maxDimensions
: wordCountList.size();
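        // rowMask and colMask map indices in the masked matrix back to rows
        // and columns of the matrix it wraps: the rows are reordered into
        // newVectorList below, so rowMask is the identity over the retained
        // rows, while colMask maps each retained column back to that term's
        // original index.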
int[] rowMask = new int[maxWords];
int[] colMask = new int[wordCount];
        // Create a new vector list to store the word semantics that will be
        // retained; all other vectors are discarded once this method returns.
SparseDoubleVector[] newVectorList = new SparseDoubleVector[maxWords];
        // For each term that has a mapping, add row and column masks for the
        // indices of the first maxWords terms.  For all other terms, remove
        // the term-to-index mapping.
int termCount = 0;
for (Map.Entry<String, AtomicInteger> entry : wordCountList) {
Integer oldIndex = termToIndex.get(entry.getKey());
            // Skip any unmapped terms.
if (oldIndex == null)
continue;
            // Add a row and/or column mask from the index of this word to its
            // index in the original matrix.
if (termCount < maxWords) {
if (termCount < wordCount)
colMask[termCount] = oldIndex;
                // Add the vector for this retained word to the new vector list.
newVectorList[termCount] = vectorList[oldIndex];
// Record the new dimension for this term.
rowMask[termCount] = termCount;
termToIndex.put(entry.getKey(), termCount);
termCount++;
}
// Drop all other mappings.
else
termToIndex.remove(entry.getKey());
}
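        // The per-word raw semantics are no longer needed; release them so
        // they can be garbage collected.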
wordToSemantics = null;
matrix = Matrices.asSparseMatrix(Arrays.asList(newVectorList));
// Return a masked version of the original matrix.
return new CellMaskedSparseMatrix(matrix, rowMask, colMask);
}
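    /**
     * A comparator that orders word-frequency entries by descending count,
     * breaking ties by the reverse lexicographic order of the words.
     */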
private class EntryComp
implements Comparator<Map.Entry<String,AtomicInteger>> {
public int compare(Map.Entry<String, AtomicInteger> o1,
Map.Entry<String, AtomicInteger> o2) {
int diff = o2.getValue().get() - o1.getValue().get();
return (diff != 0) ? diff : o2.getKey().compareTo(o1.getKey());
}
}
}