// Source code of edu.ucla.sspace.tools.DependencyBasisMaker$OccurrenceCounter

/*
 * Copyright 2010 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */




package edu.ucla.sspace.tools;


import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.basis.StringBasisMapping;


import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.common.SemanticSpace;


import edu.ucla.sspace.dependency.DependencyExtractor;
import edu.ucla.sspace.dependency.DependencyExtractorManager;
import edu.ucla.sspace.dependency.DependencyIterator;
import edu.ucla.sspace.dependency.DependencyPath;
import edu.ucla.sspace.dependency.DependencyPathAcceptor;
import edu.ucla.sspace.dependency.DependencyPathWeight;
import edu.ucla.sspace.dependency.DependencyTreeNode;
import edu.ucla.sspace.dependency.FilteredDependencyIterator;
import edu.ucla.sspace.dependency.FlatPathWeight;
import edu.ucla.sspace.dependency.UniversalPathAcceptor ;


import edu.ucla.sspace.mains.DependencyGenericMain;


import edu.ucla.sspace.matrix.AtomicGrowingSparseHashMatrix;
import edu.ucla.sspace.matrix.NoTransform;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.Transform;


import edu.ucla.sspace.text.IteratorFactory;


import edu.ucla.sspace.util.BoundedSortedMap;
import edu.ucla.sspace.util.Pair;


import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOError;
import java.io.IOException;
import java.io.ObjectOutputStream;


import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;




/**
 * This main creates a {@link BasisMapping} based on the unique terms found in a
 * document set and serializes it to disk.
 *
 * @author Keith Stevens
 */
public class DependencyBasisMaker extends DependencyGenericMain {


    /**
     * {@inheritDoc}
     */
    public void addExtraOptions(ArgOptions options) { 
        options.addOption('b', "basisSize",
                          "Specifies the total desired size of the basis " +
                          "(Default: 10000)",
                          true, "INT", "Optional");
        options.addOption('a', "pathAcceptor",
                          "Specifies the dependency path acceptor to use. " +
                          "(Default:    UnivseralPathAcceptor)",
                          true, "CLASSNAME", "Optional");
        options.addOption('w', "pathWeighter",
                          "Specifies the dependency path weighter to use. " +
                          "(Default:    FlatPathWeight)",
                          true, "CLASSNAME", "Optional");
        options.addOption('l', "pathLength",
                          "Specifies the maximum dependency path length. " +
                          "(Default:    5)",
                          true, "INT", "Optional");
    }


    /**
     * {@inheritDoc}
     */
    protected SemanticSpace getSpace() {
        setupDependencyExtractor();


        int bound = argOptions.getIntOption('b', 10000);
        Transform transform = argOptions.getObjectOption(
                'T', new NoTransform());
        DependencyPathAcceptor acceptor = argOptions.getObjectOption(
                'a', new UniversalPathAcceptor());
        DependencyPathWeight weighter = argOptions.getObjectOption(
                'w', new FlatPathWeight());
        int pathLength = argOptions.getIntOption('l', 5);
        return new OccurrenceCounter(
                transform, bound, acceptor, weighter, pathLength);
    }


    /**
     * Saves the {@link BasisMapping} created from the {@link
     * OccurrenceCounter}.
     */
    protected void saveSSpace(SemanticSpace sspace, File outputFile)
            throws IOException{
        BasisMapping<String, String> savedTerms = new StringBasisMapping();
        for (String term : sspace.getWords())
            savedTerms.getDimension(term);


        ObjectOutputStream ouStream = new ObjectOutputStream(
                new FileOutputStream(outputFile));
        ouStream.writeObject(savedTerms);
        ouStream.close();
    }


    /**
     * A simple term {@link SemanticSpace} implementation that counts word
     * co-occurrences, performs a transform, and then scores each recorded basis
     * dimension based on the row summed scores for each word.
     */
    public class OccurrenceCounter implements SemanticSpace {


        /**
         * The matrix used for storing weight co-occurrence statistics of those
         * words that occur both before and after.
         */
        private final AtomicGrowingSparseHashMatrix cooccurrenceMatrix;


        /**
         * The {@link BasisMapping} used to record dimensions.
         */
        private final BasisMapping<String, String> basis;


        /**
         * The final scores for each word in the {@code basis}.
         */
        private final Map<String, Double> wordScores;


        /**
         * The {@link Transform} class used to rescore each word.
         */
        private final Transform transform;


        /**
         * The {@link DependencyPathAcceptor} used to accept or reject
         * dependency paths.
         */
        private final DependencyPathAcceptor acceptor;


        /**
         * The {@link DependencyPathWeight} used to score dependency paths.
         */
        private final DependencyPathWeight weighter;


        /**
         * The maximum path length that is acceptable.
         */
        private final int pathLength;


        /**
         * The {@link DependencyExtractor} used to extract parse trees from each
         * document.
         */
        private final DependencyExtractor extractor;


        /**
         * Creates a new {@link OccurrenceCounter}.
         */
        public OccurrenceCounter(Transform transform,
                                 int bound, 
                                 DependencyPathAcceptor acceptor,
                                 DependencyPathWeight weighter,
                                 int pathLength) {
            cooccurrenceMatrix = new AtomicGrowingSparseHashMatrix();
            basis = new StringBasisMapping();
            wordScores = new BoundedSortedMap<String, Double>(bound);
            extractor = DependencyExtractorManager.getDefaultExtractor();


            this.transform = transform;
            this.acceptor = acceptor;
            this.weighter = weighter;
            this.pathLength = pathLength;
        }


        /**
         * {@inheritDoc}
         */
        public void processDocument(BufferedReader document)
                throws IOException {
            // Rather than updating the matrix every time an occurrence is
            // seen, keep a thread-local count of what needs to be modified
            // in the matrix and update after the document has been
            // processed.  This saves potential contention from concurrent
            // writes.
            Map<Pair<Integer>,Double> matrixEntryToCount = 
                    new HashMap<Pair<Integer>,Double>();


            // Iterate over all of the parseable dependency parsed sentences in
            // the document.
            for (DependencyTreeNode[] nodes = null; 
                    (nodes = extractor.readNextTree(document)) != null; ) {


                // Skip empty documents.
                if (nodes.length == 0)
                    continue;                        


                // Examine the paths for each word in the sentence.
                for (int wordIndex = 0; wordIndex < nodes.length; ++wordIndex) {
                    String focusWord = nodes[wordIndex].word();                            
                    int focusIndex = basis.getDimension(focusWord);


                    // Get all the valid paths starting from this word.    The
                    // acceptor will filter out any paths that don't contain the
                    // semantic connections we're looking for.
                    Iterator<DependencyPath> paths =
                        new FilteredDependencyIterator(
                                nodes[wordIndex], acceptor, pathLength);
                            
                    // For each of the paths rooted at the focus word, update
                    // the co-occurrences of the focus word in the dimension
                    // that the BasisFunction states.
                    while (paths.hasNext()) {
                        DependencyPath path = paths.next();


                        String occurrence = path.last().word();
                        int featureIndex = basis.getDimension(occurrence);


                        double score = weighter.scorePath(path);
                        matrixEntryToCount.put(new Pair<Integer>(
                                    focusIndex, featureIndex), score);
                    }
                }
            }


            // Once the document has been processed, update the co-occurrence
            // matrix accordingly.
            for (Map.Entry<Pair<Integer>,Double> e :
                    matrixEntryToCount.entrySet()){
                    Pair<Integer> p = e.getKey();
                    cooccurrenceMatrix.addAndGet(p.x, p.y, e.getValue());
            }
        }


        /**
         * {@inheritDoc}
         */
        public Set<String> getWords() {
            return Collections.unmodifiableSet(wordScores.keySet());
        }


        /**
         * {@inheritDoc}
         */
        public DoubleVector getVector(String word) {
            Double score = wordScores.get(word);
            return (score == null)
                ? new DenseVector(new double[] {0})
                : new DenseVector(new double[] {score});
        }


        /**
         * {@inheritDoc}
         */
        public int getVectorLength() {
            return 1;
        }


        /**
         * {@inheritDoc}
         */
        public void processSpace(Properties properties) {
            SparseMatrix cleanedMatrix = (SparseMatrix) transform.transform(
                    cooccurrenceMatrix);
            for (String term : basis.keySet()) {
                int index = basis.getDimension(term);
                SparseDoubleVector sdv = cleanedMatrix.getRowVector(index);


                double score = 0;
                for (int i : sdv.getNonZeroIndices())
                    score += sdv.get(i);


                wordScores.put(term, score);
            }
        }


        /**
         * {@inheritDoc}
         */
        public String getSpaceName() {
            return "BasisMaker";
        }
    }
}
// Source code of edu.ucla.sspace.tools.DependencyBasisMaker$OccurrenceCounter

// Related classes of edu.ucla.sspace.tools.DependencyBasisMaker$OccurrenceCounter