Package edu.ucla.sspace.tools

Source Code of edu.ucla.sspace.tools.BasisMaker$OccurrenceCounter

/*
* Copyright 2010 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.tools;

import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.basis.StringBasisMapping;

import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.common.SemanticSpace;

import edu.ucla.sspace.hal.LinearWeighting;
import edu.ucla.sspace.hal.WeightingFunction;

import edu.ucla.sspace.mains.GenericMain;

import edu.ucla.sspace.matrix.AtomicGrowingSparseHashMatrix;
import edu.ucla.sspace.matrix.NoTransform;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.Transform;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.util.BoundedSortedMap;
import edu.ucla.sspace.util.ReflectionUtil;
import edu.ucla.sspace.util.Pair;

import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOError;
import java.io.IOException;
import java.io.ObjectOutputStream;

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;


/**
* This main creates a {@link BasisMapping} based on the unique terms found in a
* document set and serializes it to disk.
*
* @author Keith Stevens
*/
public class BasisMaker extends GenericMain {

    /**
     * {@inheritDoc}
     */
    protected void addExtraOptions(ArgOptions options) {
        options.addOption('T', "matrixTransform",
                          "Specifies the matrix transform that should be applied " +
                          "to co-occurrence counts after they have been generated",
                          true, "CLASSNAME", "Optional");
        options.addOption('b', "basisSize",
                          "Specifies the total desired size of the basis " +
                          "(Default: 10000)",
                          true, "INT", "Optional");
        options.addOption('w', "windowSize",
                          "Specifies the sliding window size (Default: 5)",
                          true, "INT", "Optional");
        options.addOption('p', "printWeights",
                          "If true, each saved word and it's associated weight " +
                          "will be printed to standard out",
                          false, null, "Optional");
    }

    /**
     * {@inheritDoc}
     */
    protected SemanticSpace getSpace() {
        Transform transform = null;
        if (argOptions.hasOption('T'))
            transform = ReflectionUtil.getObjectInstance(
                    argOptions.getStringOption('T'));
        else
            transform = new NoTransform();
        int bound = argOptions.getIntOption('b', 10000);
        int windowSize = argOptions.getIntOption('w', 5);
        return new OccurrenceCounter(transform, bound, windowSize);
    }

    /**
     * Saves the {@link BasisMapping} created from the {@link
     * OccurrenceCounter}.
     */
    protected void saveSSpace(SemanticSpace sspace, File outputFile)
            throws IOException{
        BasisMapping<String, String> savedTerms = new StringBasisMapping();
        for (String term : sspace.getWords())
            savedTerms.getDimension(term);

        ObjectOutputStream ouStream = new ObjectOutputStream(
                new FileOutputStream(outputFile));
        ouStream.writeObject(savedTerms);
        ouStream.close();
    }

    /**
     * A simple term {@link SemanticSpace} implementation that counts word
     * co-occurrences, performs a transform, and then scores each recorded basis
     * dimension based on the row summed scores for each word.
     */
    public class OccurrenceCounter implements SemanticSpace {

        /**
         * The matrix used for storing weight co-occurrence statistics of those
         * words that occur both before and after.
         */
        private final AtomicGrowingSparseHashMatrix cooccurrenceMatrix;

        /**
         * The type of weight to apply to a the co-occurrence word based on its
         * relative location
         */
        private final WeightingFunction weighting;

        /**
         * The {@link BasisMapping} used to record dimensions.
         */
        private final BasisMapping<String, String> basis;

        /**
         * The final scores for each word in the {@code basis}.
         */
        private final Map<String, Double> wordScores;

        /**
         * The {@link Transform} class used to rescore each word.
         */
        private final Transform transform;

        /**
         * The sliding window size used when traversing documents.
         */
        private final int windowSize;

        /**
         * Creates a new {@link OccurrenceCounter}.
         */
        public OccurrenceCounter(Transform transform,
                                 int bound,
                                 int windowSize) {
            cooccurrenceMatrix = new AtomicGrowingSparseHashMatrix();
            basis = new StringBasisMapping();
            wordScores = new BoundedSortedMap<String, Double>(bound);
            weighting = new LinearWeighting();

            this.transform = transform;
            this.windowSize = windowSize;
        }

        /**
         * {@inheritDoc}
         */
        public void processDocument(BufferedReader document)
               throws IOException {
            Queue<String> nextWords = new ArrayDeque<String>();
            Queue<String> prevWords = new ArrayDeque<String>();
                   
            Iterator<String> documentTokens =
                IteratorFactory.tokenizeOrdered(document);
                   
            String focus = null;

            // Rather than updating the matrix every time an occurrence is seen,
            // keep a thread-local count of what needs to be modified in the
            // matrix and update after the document has been processed.    This
            // saves potential contention from concurrent writes.
            Map<Pair<Integer>,Double> matrixEntryToCount =
                    new HashMap<Pair<Integer>,Double>();
                   
            //Load the first windowSize words into the Queue               
            for(int i = 0;    i < windowSize && documentTokens.hasNext(); i++)
                nextWords.offer(documentTokens.next());
                   
            while(!nextWords.isEmpty()) {
                // Load the top of the nextWords Queue into the focus word
                focus = nextWords.remove();

                // Add the next word to nextWords queue (if possible)
                if (documentTokens.hasNext())
                    nextWords.offer(documentTokens.next());

                // If the filter does not accept this word, skip the semantic
                // processing, continue with the next word
                if (focus.equals(IteratorFactory.EMPTY_TOKEN)) {
                    int focusIndex = basis.getDimension(focus);
                   
                    countOccurrences(nextWords, focusIndex,
                                     1, matrixEntryToCount);
                    countOccurrences(prevWords, focusIndex,
                                     -prevWords.size(), matrixEntryToCount);
                }

                // last, put this focus word in the prev words and shift off the
                // front if it is larger than the window
                prevWords.offer(focus);
                if (prevWords.size() > windowSize)
                    prevWords.remove();
            }

            // Once the document has been processed, update the co-occurrence
            // matrix accordingly.
            for (Map.Entry<Pair<Integer>,Double> e : matrixEntryToCount.entrySet()){
                Pair<Integer> p = e.getKey();
                cooccurrenceMatrix.addAndGet(p.x, p.y, e.getValue());
            }                                       
        }

        /**
         * Adds a occurnce count for each term in {@code words} according to
         * it's distance from the focus word.
         */
        private void countOccurrences(Queue<String> words,
                                      int focusIndex,
                                      int wordDistance,
                                      Map<Pair<Integer>, Double> entryCounts) {
            // Iterate through the words occurring after and add values
            for (String term : words) {
                // skip adding co-occurence values for words that are not
                // accepted by the filter
                if (!term.equals(IteratorFactory.EMPTY_TOKEN)) {
                    int index = basis.getDimension(term);
                   
                    // Get the current number of times that the focus word has
                    // co-occurred with this word appearing after it.    Weight
                    // the word appropriately based on distance
                    Pair<Integer> p = new Pair<Integer>(focusIndex, index);
                    double value = weighting.weight(wordDistance, windowSize);
                    Double curCount = entryCounts.get(p);
                    entryCounts.put(p, (curCount == null) ? value : value + curCount);
                }
                wordDistance++;
            }
        }

        /**
         * {@inheritDoc}
         */
        public Set<String> getWords() {
            return Collections.unmodifiableSet(wordScores.keySet());
        }

        /**
         * {@inheritDoc}
         */
        public DoubleVector getVector(String word) {
            Double score = wordScores.get(word);
            return (score == null)
                ? new DenseVector(new double[] {0})
                : new DenseVector(new double[] {score});
        }

        /**
         * {@inheritDoc}
         */
        public int getVectorLength() {
            return 1;
        }

        /**
         * {@inheritDoc}
         */
        public void processSpace(Properties properties) {
            SparseMatrix cleanedMatrix = (SparseMatrix) transform.transform(
                    cooccurrenceMatrix);
            for (String term : basis.keySet()) {
                int index = basis.getDimension(term);
                SparseDoubleVector sdv = cleanedMatrix.getRowVector(index);

                double score = 0;
                for (int i : sdv.getNonZeroIndices())
                    score += sdv.get(i);

                wordScores.put(term, score);
            }
        }

        /**
         * {@inheritDoc}
         */
        public String getSpaceName() {
            return "BasisMaker";
        }
    }
}
TOP

Related Classes of edu.ucla.sspace.tools.BasisMaker$OccurrenceCounter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.