Package edu.ucla.sspace.common

Source Code of edu.ucla.sspace.common.DocumentVectorBuilder

/*
* Copyright 2009 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.common;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.VectorMath;

import java.io.BufferedReader;

import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;


/**
* {@code DocumentVectorBuilder} generates {@code Vector} representations of a
* document, based on semantic {@code Vector}s provided for a {@code
* SemanticSpace}.  This can be consider as a projecting the document into the
* semantic space.
*
* <p>
*
* Documents will be tokenized using the current tokenizing
* method, and the vector in the {@code SemanticSpace} corresponding to each
* word found in the document will be combined together. 
*
* <p>
* Options for combining term {@code Vector}s include summation, average, and
* term frequency weighting.
*
* @author Keith Stevens
*/
public class DocumentVectorBuilder {

    /**
     * The base prefix for all properties.
     */
    private static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.common.DocumentVectorBuilder";

    /**
     * The property to specify if term frequencies should be used when combining
     * term vectors.
     */
    public static final String USE_TERM_FREQUENCIES_PROPERTY =
        PROPERTY_PREFIX + ".usetf";

    /**
     * The {@code SemanticSpace} which will provide a {@code Vector} for each
     * word found in a document.
     */
    private final SemanticSpace sspace;

    private final boolean useTermFreq;

    /**
     * Creates a {@code DocumentVectorBuilder} from a {@code SemanticSpace} and
     * extracts options from the system wide {@code Properties}.
     */
    public DocumentVectorBuilder(SemanticSpace baseSpace) {
        this(baseSpace, System.getProperties());
    }

    /**
     * Creates a {@code DocumentVectorBuilder} from a {@code SemanticSpace} and
     * extracts options from the given {@code Properties}.
     */
    public DocumentVectorBuilder(SemanticSpace baseSpace, Properties props) {
        sspace = baseSpace;
        useTermFreq = props.getProperty(USE_TERM_FREQUENCIES_PROPERTY) != null;
    }

    /**
     * Represent a document as the summation of term Vectors.
     *
     * @param document A {@code BufferedReader} for a document to project into a
     *                 {@code SemanticSpace}.
     * @param documentVector A {@code Vector} which has been pre-allocated to
     *                       store the document's representation.  This is
     *                       pre-allocated so that users of {@code
     *                       DocumentVectorBuilder} can decide what type of
     *                       {@code Vector} should be used to represent a
     *                       document.
     *
     * @return {@code documentVector} after it has been modified to represent
     *         the terms in {@code document}.
     */
    public DoubleVector buildVector(BufferedReader document,
                              DoubleVector documentVector) {
        // Tokenize and determine what words exist in the document, along with
        // the requested meta information, such as a term frequency.
        Map<String, Integer> termCounts = new HashMap<String, Integer>();
        Iterator<String> articleTokens = IteratorFactory.tokenize(document);
        while (articleTokens.hasNext()) {
            String term = articleTokens.next();
            Integer count = termCounts.get(term);
            termCounts.put(term, (count == null || !useTermFreq)
                                 ? 1 : count.intValue() + 1);
        }

        // Iterate through each term in the document and sum the term Vectors
        // found in the provided SemanticSpace.
        for (Map.Entry<String, Integer> entry : termCounts.entrySet()) {
            Vector termVector = sspace.getVector(entry.getKey());
            if (termVector == null)
                continue;
            add(documentVector, termVector, entry.getValue());
        }

        return documentVector;
    }

    public void add(DoubleVector dest, Vector src, int factor) {
        if (src instanceof SparseVector) {
            int[] nonZeros = ((SparseVector) src). getNonZeroIndices();
            for (int i : nonZeros)
                dest.add(i, src.getValue(i).doubleValue());
        } else {
            for (int i = 0; i < src.length(); ++i)
                dest.add(i, src.getValue(i).doubleValue());
        }
    }
}
TOP

Related Classes of edu.ucla.sspace.common.DocumentVectorBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.