/*
* Copyright 2010 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.vsm;
import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.basis.StringBasisMapping;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.GenericTermDocumentVectorSpace;
import edu.ucla.sspace.matrix.MatrixBuilder;
import edu.ucla.sspace.matrix.MatrixFile;
import edu.ucla.sspace.matrix.MatrixIO;
import edu.ucla.sspace.matrix.MatrixIO.Format;
import edu.ucla.sspace.matrix.SvdlibcSparseBinaryMatrixBuilder;
import edu.ucla.sspace.matrix.Transform;
import edu.ucla.sspace.util.ReflectionUtil;
import edu.ucla.sspace.vector.DoubleVector;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.ConcurrentMap;;
/**
* An implementation of the <a
* href="http://en.wikipedia.org/wiki/Vector_space_model">Vector Space Model</a>
* (VSM). This model is based on the paper <ul>
*
* <li style="font-family:Garamond, Georgia, serif"> G. Salton, A. Wong, and
* C. S. Yang (1975), "A Vector Space Model for Automatic Indexing,"
* Communications of the ACM, vol. 18, nr. 11, pages 613–620. Available <a
* href="http://doi.acm.org/10.1145/361219.361220">here</a> </li>
*
* </ul>
*
* <p>
*
* The VSM first processes documents into a word-document matrix where each
* unique word is assigned a row in the matrix, and each column represents a
* document. The values of this matrix correspond to the number of times the
* row's word occurs in the column's document. Optionally, after the matrix has
* been completed, its values may be transformed. This is frequently done
* using the {@link edu.ucla.sspace.matrix.TfIdfTransform Tf-Idf Transform}.
*
* <p>
*
* This class offers one configurable parameter.
*
* <dl style="margin-left: 1em">
*
* <dt> <i>Property:</i> <code><b>{@value #MATRIX_TRANSFORM_PROPERTY}
* </b></code> <br>
* <i>Default:</i> none
*
* <dd style="padding-top: .5em">This variable sets the preprocessing algorithm
* to use on the term-document matrix. The property value should be the
* fully qualified name of a class that implements {@link Transform}. The
* class should be public, not abstract, and should provide a public no-arg
* constructor.<p>
*
* </dl> <p>
*
* <p>
*
* This class is thread-safe for concurrent calls of {@link
* #processDocument(java.io.BufferedReader) processDocument}. Once {@link
* #processSpace(Properties) processSpace} has been called, no further calls to
* {@code processDocument} should be made. This implementation does not support
* access to the semantic vectors until after {@code processSpace} has been
* called.
*
* @see Transform
*
* @author David Jurgens
*/
public class VectorSpaceModel extends GenericTermDocumentVectorSpace {

    /**
     * The prefix for naming publicly accessible properties.
     */
    private static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.vsm.VectorSpaceModel";

    /**
     * The property to define the {@link Transform} class to be used
     * when processing the space after all the documents have been seen.
     */
    public static final String MATRIX_TRANSFORM_PROPERTY =
        PROPERTY_PREFIX + ".transform";

    /**
     * The name returned by {@link #getSpaceName()}.
     */
    private static final String VSM_SSPACE_NAME =
        "vector-space-model";

    /**
     * Constructs the {@code VectorSpaceModel} with default settings: the
     * document header token is not read, terms are mapped to indices with a
     * new {@link StringBasisMapping}, and the term-document matrix is written
     * with a new {@link SvdlibcSparseBinaryMatrixBuilder}.
     *
     * @throws IOException if this instance encounters any errors when creating
     *         the backing array files required for processing
     */
    public VectorSpaceModel() throws IOException {
        super(false, new StringBasisMapping(),
              new SvdlibcSparseBinaryMatrixBuilder());
    }

    /**
     * Constructs a new {@code VectorSpaceModel} using the provided
     * objects for processing.
     *
     * @param readHeaderToken If true, the first token of each document will be
     *        read and passed to {@link #handleDocumentHeader(int, String)
     *        handleDocumentHeader}, which discards the header.
     * @param termToIndex The {@link BasisMapping} used to map strings to
     *        indices.
     * @param termDocumentMatrixBuilder The {@link MatrixBuilder} used to write
     *        document vectors to disk which later get processed in {@link
     *        #processSpace(Properties) processSpace}.
     *
     * @throws IOException if this instance encounters any errors when creating
     *         the backing array files required for processing
     */
    public VectorSpaceModel(
            boolean readHeaderToken,
            BasisMapping<String, String> termToIndex,
            MatrixBuilder termDocumentMatrixBuilder) throws IOException {
        super(readHeaderToken, termToIndex, termDocumentMatrixBuilder);
    }

    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return VSM_SSPACE_NAME;
    }

    /**
     * Returns the vector corresponding to a document processed by this space.
     * Vector values represent the word frequencies that have been transformed
     * according to the {@link Transform} configured through {@link
     * #processSpace(Properties) processSpace}.
     *
     * <p> This method requires that {@code processSpace} has been called first
     * to ensure the semantic space's state is properly constructed.  Calls
     * before this point will throw an {@link IllegalStateException}.
     *
     * <p> Implementation note: If a specific document ordering is needed,
     * caution should be used when using this class in a multi-threaded
     * environment.  Because the document number is based on what order it was
     * <i>processed</i>, no guarantee is made that this will correspond with the
     * original document ordering as it exists in the corpus files.  However,
     * in a single-threaded environment, the ordering will be preserved.
     *
     * @param documentNumber {@inheritDoc}
     *
     * @return {@inheritDoc}
     *
     * @throws IllegalArgumentException {@inheritDoc}
     * @throws IllegalStateException {@inheritDoc}
     */
    @Override
    public DoubleVector getDocumentVector(int documentNumber) {
        if (wordSpace == null) {
            throw new IllegalStateException(
                "The document space has not yet been generated.");
        }
        // Documents are stored as the columns of the word space, so a valid
        // document number is a valid column index.
        if (documentNumber < 0 || documentNumber >= wordSpace.columns()) {
            throw new IllegalArgumentException(
                "Document number is not within the bounds of the number of "
                + "documents: " + documentNumber);
        }
        return wordSpace.getColumnVector(documentNumber);
    }

    /**
     * Returns the number of documents processed by {@link
     * VectorSpaceModel}, which is the number of columns in the word space.
     *
     * @return the number of columns of the loaded word space
     *
     * @throws IllegalStateException if the word space is still {@code null},
     *         i.e. it has not yet been loaded by {@link
     *         #processSpace(Properties) processSpace}
     */
    @Override
    public int documentSpaceSize() {
        if (wordSpace == null) {
            throw new IllegalStateException(
                "The document space has not yet been generated.");
        }
        return wordSpace.columns();
    }

    /**
     * {@inheritDoc}
     *
     * <p> This implementation looks up {@link #MATRIX_TRANSFORM_PROPERTY} in
     * the provided properties and, when it is set, instantiates the named
     * {@link Transform} class reflectively.  The (possibly {@code null})
     * transform is handed to the superclass's processing step, and the
     * resulting matrix file is then read back in as the word space.
     *
     * @param properties {@inheritDoc} See this class's {@link VectorSpaceModel
     *        javadoc} for the full list of supported properties.  A {@code
     *        null} value is treated as having no properties set, so no
     *        transform is applied.
     *
     * @throws IOError if an {@link IOException} occurs while the matrix is
     *         being processed or read back
     */
    public void processSpace(Properties properties) {
        try {
            // Load any optionally specified transform class.  A null
            // Properties is handled like one without the transform property
            // instead of failing with a NullPointerException.
            String transformClass = (properties == null)
                ? null
                : properties.getProperty(MATRIX_TRANSFORM_PROPERTY);

            Transform transform = null;
            if (transformClass != null) {
                transform = ReflectionUtil.getObjectInstance(transformClass);
            }

            MatrixFile processedSpace = super.processSpace(transform);
            // NOTE(review): progress is reported on standard output; consider
            // routing these messages through a logger instead.
            System.out.printf("Matrix saved in %s as %s%n",
                              processedSpace.getFile(),
                              processedSpace.getFormat());

            wordSpace = MatrixIO.readMatrix(processedSpace.getFile(),
                                            processedSpace.getFormat());
            System.out.printf("loaded word space of %d x %d%n",
                              wordSpace.rows(), wordSpace.columns());
        } catch (IOException ioe) {
            // The method signature declares no checked exceptions, so the
            // failure is surfaced as an unchecked IOError with its cause.
            throw new IOError(ioe);
        }
    }
}