/*
* Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at
* the Lawrence Livermore National Laboratory. Written by Keith Stevens,
* kstevens@cs.ucla.edu OCEC-10-073 All rights reserved.
*
* This file is part of the C-Cat package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.mains;
import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.SemanticSpaceIO;
import edu.ucla.sspace.clustering.Clustering;
import edu.ucla.sspace.clustering.OnlineClustering;
import edu.ucla.sspace.text.Document;
import edu.ucla.sspace.util.Generator;
import edu.ucla.sspace.util.ReflectionUtil;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.wordsi.AssignmentReporter;
import edu.ucla.sspace.wordsi.ContextExtractor;
import edu.ucla.sspace.wordsi.ContextGenerator;
import edu.ucla.sspace.wordsi.EvaluationWordsi;
import edu.ucla.sspace.wordsi.GeneralContextExtractor;
import edu.ucla.sspace.wordsi.StreamingWordsi;
import edu.ucla.sspace.wordsi.WaitingWordsi;
import edu.ucla.sspace.wordsi.psd.PseudoWordContextExtractor;
import edu.ucla.sspace.wordsi.psd.PseudoWordReporter;
import edu.ucla.sspace.wordsi.semeval.SemEvalContextExtractor;
import edu.ucla.sspace.wordsi.semeval.SemEvalReporter;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.ObjectInputStream;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
* A base implementation for {@link Wordsi} executables. This class provides
* base arguments that nearly all {@link Wordsi} executables will require, along
* with basic processing for those arguments.
*
* <p>
*
* This class provides access to three different word sense modes: online
* clustering, offline clustering, and an evaluation mode. For the two
* clustering modes, word senses are generated by clustering individual context
* vectors. The first mode uses {@link StreamingWordsi} and the second mode
* uses {@link WaitingWordsi}. The third mode assumes that the word senses have
* already been learned and are fixed. Individual contexts are labeled with the
* most similar word sense.
*
* <p>
*
* This class provides access to two evaluation modes: Pseudo Word
* Discrimination and the SenseEval/SemEval evaluation. When training a {@link
* Wordsi} model for a pseudo word task, the {@code -P} option should be set
* with a file mapping raw tokens to their pseudo words so that {@link Wordsi}
* knows which words form pseudo words. {@link Wordsi} will generate a report
* that specifies how many times each core word in a pseudo word was assigned
* to a word sense for the pseudo word. When running in evaluation mode, the
* {@code -e} option must be set to a trained Wordsi semantic space, along with
* one of the evaluation type options.
*
* <p>
*
* Since {@link Wordsi} instances will need to reuse features during training
* and testing, the {@code --save} and {@code --load} options are provided.
* {@code --save} will store any data structures that are required for
* generating context vectors. {@code --load} will load these same data
* structures from disk and re-use them. In general, {@code --save} should be
* used during training and {@code --load} should be used during testing.
* Different {@link Wordsi} executables will serialize different data
* structures, but these will generally be a mapping from strings to some other
* data type.
*
* <p>
*
* {@code GenericMain} provides the core options used by this base executable.
* This class provides the following additional options:
*
* <ul>
* <li><u>Required (one of)</u>:
* <ul>
* <li>{@code -s}, {@code --streamingClustering=CLASSNAME} Specifies the
* streaming clustering algorithm to use for forming word senses.</li>
*
* <li>{@code -b}, {@code --batchClustering=CLASSNAME} Specifies the batch
* clustering algorithm to use for forming word senses.</li>
*
* <li>{@code -e}, {@code --evaluationClustering=FILE} Specifies a trained
* Wordsi semantic space to be used for evaluation. When set, one of the
* Evaluation Type arguments must be set.</li>
* </ul>
* </li>
*
* <li><u>Evaluation Type</u>
* <ul>
* <li>{@code -P}, {@code --pseudoWordEvaluation=FILENAME} Specifies a
* mapping from raw tokens to their pseudo word token. Only the raw tokens
* in this mapping will be represented in the {@link Wordsi} space. A
* {@link PseudoWordReporter} will be generated for these pseudo words.</li>
*
* <li>{@code -E}, {@code --semEvalEvaluation=STRING} Signifies that the
* data files are in the SemEval format and that only test instance words
* should be represented in the Wordsi space. Each line must correspond to
* an instance context and the focus word must be preceded by the token
* given as the argument to this option.</li>
* </ul>
* </li>
*
* <li><u>Optional</u>
* <ul>
* <li>{@code -a}, {@code --acceptedWords=FILENAME} Specifies the set of
* words which should be represented by Wordsi. (Default: all words).</li>
*
* <li>{@code -c}, {@code --clusters} Specifies the desired number of
* clusters, or word senses. (Default: 0).</li>
*
* <li>{@code -W}, {@code --windowSize} Specifies the number of words, in
* one direction, that form a valid context. For example, a window size of
* 5 means that up to 5 words before and after a focus word are used to form
* the context. (Default: 5).</li>
*
* </ul>
* </li>
*
* <li><u>Serialization</u>
* <ul>
* <li>{@code -S}, {@code --save} Specifies a file to which all files
* needed to generate context vectors will be serialized.</li>
*
* <li>{@code -L}, {@code --load} Specifies a file from which all files
* needed to generate context vectors will be deserialized.</li>
* </ul>
* </li>
* </ul>
*
* @author Keith Stevens
*/
public abstract class GenericWordsiMain extends GenericMain {
private ObjectOutputStream saveStream = null;
private ObjectInputStream loadStream = null;
/**
* {@inheritDoc}
*/
protected void addExtraOptions(ArgOptions options) {
// Remove some crufty options.
options.removeOption('Z');
options.removeOption('X');
options.removeOption('o');
options.removeOption('w');
// Set the three runtime mode arguments.
options.addOption('s', "streamingClustering",
"Specifies the streaming clustering algorithm to " +
"use for forming word senses",
true, "CLASSNAME", "Required (one of)");
options.addOption('b', "batchClustering",
"Specifies the batch clustering algorithm to " +
"use for forming word senses",
true, "CLASSNAME", "Required (one of)");
options.addOption('e', "evaluationClustering",
"Specifies a trained Wordsi semantic space to be " +
"used for evaluation. When set, one of the " +
"Evaluation Type arguments must be set",
true, "<sspace>", "Required (one of)");
// Set the evaluation type arguments.
options.addOption('P', "pseudoWordEvaluation",
"Specifies a mapping from raw tokens to their " +
"pseudo word token. Only the raw tokens in this " +
"mapping will be represented in the Wordsi space. " +
"A PseudoWordReport will be generated for these " +
"pseudo words. This overrides the -a option",
true, "FILENAME", "Evaluation Type");
options.addOption('E', "semEvalEvaluation",
"Signifies that the data files are in the SemEval " +
"format and that only test instance words should " +
"be represented in the Wordsi space. Each line " +
"must correspond to an instance context and the " +
"focus word must be precceded by the token given " +
"as the argument to this option.",
true, "STRING", "Evaluation Type");
options.addOption('N', "wordlistEvaluation",
"Learned word senses are assumed to be related to " +
"the senses in for other words in the " +
"acceptedWords list. This evaluation will track " +
"the headers for documents which should mark " +
"whether or not the focus words are being used " +
"with their common sense.",
false, null, "Evaluation Type");
// Set the optional arguments.
options.addOption('a', "acceptedWords",
"Specifies the set of words which should be " +
"represented by Wordsi. (Default: all words)",
true, "FILENAME", "Optional");
options.addOption('c', "clusters",
"Specifies the desired number of clusters, or " +
"word senses. (Default: 0)",
true, "INT", "Optional");
options.addOption('W', "windowSize",
"Specifies the number of words, in one direction, " +
"that form a valid context. For example, a window " +
"size of 5 means that up to 5 words before and " +
"after a focus word are used to form the context. " +
"(Default: 5)",
true, "INT", "Optional");
options.addOption('h', "useHeaderToken",
"Set to true if the first token in a context " +
"should be treated as a document header. Note " +
"that this is only used when -E and -P are not " +
"used.",
false, null, "Optional");
// Set the serialization arguments.
options.addOption('S', "save",
"Specfies a file to which all files needed to " +
"generate context vectors will be serialized",
true, "FILENAME", "Serialization");
options.addOption('L', "load",
"Specfies a file from which all files needed to " +
"generate context vectors will be deserialized",
true, "FILENAME", "Serialization");
}
/**
* Returns a {@link ContextExtractor}, which will be responsible for
* creating context vectors for documents.
*/
abstract protected ContextExtractor getExtractor();
/**
* Returns a set of strings that the {@link Wordsi} implementations should
* represent, or {@code null}, which signifies that all words should be
* represented.
*/
protected Set<String> getAcceptedWords() {
if (!argOptions.hasOption('a'))
return null;
try {
Set<String> acceptedWords = new HashSet<String>();
BufferedReader br = new BufferedReader(new FileReader(
argOptions.getStringOption('a')));
for (String line = null; (line = br.readLine()) != null; )
acceptedWords.add(line.trim().toLowerCase());
return acceptedWords;
} catch (IOException ioe) {
throw new IOError(ioe);
}
}
/**
* Returns a mapping from real tokens to their pseudo word tokens, or {@code
* null} if the {@code -P} option is not specified.
*/
protected Map<String, String> getPseudoWordMap() {
if (!argOptions.hasOption('P'))
return null;
try {
Map<String, String> pseudoWordMap = new HashMap<String, String>();
BufferedReader br = new BufferedReader(new FileReader(
argOptions.getStringOption('P')));
for (String line = null; (line = br.readLine()) != null; ) {
String[] tokens = line.split("\\s+");
pseudoWordMap.put(tokens[0].trim(), tokens[1].trim());
}
return pseudoWordMap;
} catch (IOException ioe) {
throw new IOError(ioe);
}
}
/**
* Returns a {@link ContextExtractor} that uses the given {@link
* ContextGenerator} which will process the corpus in the format specified
* by the command line. This is just a helper function for sub-classes
* implementing {@link #getExtractor}.
*/
protected ContextExtractor contextExtractorFromGenerator(
ContextGenerator generator) {
// If experimentation mode is set, mark the generator as read only.
if (argOptions.hasOption('e'))
generator.setReadOnly(true);
// If the evaluation type is for semEval, use a
// SemEvalContextExtractor.
if (argOptions.hasOption('E'))
return new SemEvalContextExtractor(
generator, windowSize(), argOptions.getStringOption('E'));
// If the evaluation type is for pseudoWord, use a
// PseudoWordContextExtractor.
if (argOptions.hasOption('P'))
return new PseudoWordContextExtractor(
generator, windowSize(), getPseudoWordMap());
// Return a standard context extractor
return new GeneralContextExtractor(generator, windowSize(),
argOptions.hasOption('h'));
}
/**
* Returns the window size used in a sliding context window.
*/
protected int windowSize() {
return argOptions.getIntOption('W', 5);
}
protected Iterator<Document> getDocumentIterator() throws IOException {
Iterator<Document> docIter = super.getDocumentIterator();
// If we are not using the pseudo word evalutor, just return the
// iterator as normal. The SemEval corpora already have their contexts
// shuffled so there is no worry about biasing the results towards a
// particular sense.
if (!argOptions.hasOption('P'))
return docIter;
// Otherwise, read in all of the documents into a list, shuffle it, and
// return an iterator over that list. This is needed to ensure that the
// ordering does not bias the clustering algorithm. NOTE that this
// assumes that the entire corpus can fit into memory.
List<Document> docList = new LinkedList<Document>();
while (docIter.hasNext())
docList.add(docIter.next());
Collections.shuffle(docList);
return docList.iterator();
}
/**
* {@inheritDoc}
*/
protected SemanticSpace getSpace() {
ArgOptions options = argOptions;
// Setup the assignment reporter. When training, the assignment report
// will only be used If the evaluation mode will be for pseudoWord.
AssignmentReporter reporter = null;
if (options.hasOption('P'))
reporter = new PseudoWordReporter(System.out);
int numClusters = options.getIntOption('c', 0);
// If Wordsi is being used in an evaluation mode, set up word space
// accordingly.
if (options.hasOption('e')) {
// If the evaluation type is not set, report an error and exit.
if (!options.hasOption('E') && !options.hasOption('P')) {
usage();
System.out.println(
"An Evaluation Type must be set when evaluating " +
" a trained Wordsi model.");
System.exit(1);
}
// Load the semantic space that has the predefined word senses from
// disk and return an EvaluationWordsi instance.
try {
SemanticSpace sspace = SemanticSpaceIO.load(
options.getStringOption('e'));
if (options.hasOption('E'))
reporter = new SemEvalReporter(System.out);
return new EvaluationWordsi(
getAcceptedWords(), getExtractor(), sspace, reporter);
} catch (IOException ioe) {
throw new IOError(ioe);
}
} else if (options.hasOption('s')) {
// Create a StreamingWordsi instance that uses the specified online
// cluster generator.
System.getProperties().setProperty(
OnlineClustering.NUM_CLUSTERS_PROPERTY,
options.getStringOption('c'));
Generator<OnlineClustering<SparseDoubleVector>> clusterGenerator =
ReflectionUtil.getObjectInstance(options.getStringOption('s'));
return new StreamingWordsi(getAcceptedWords(), getExtractor(),
clusterGenerator, reporter, numClusters);
} else if (options.hasOption('b')) {
// Create a WaitingWordsi instance that uses the specified batch
// clustering implementation.
Clustering clustering =
ReflectionUtil.getObjectInstance(options.getStringOption('b'));
return new WaitingWordsi(getAcceptedWords(), getExtractor(),
clustering, reporter, numClusters);
} else {
// None of the required options was provided, report an error and
// exit.
usage();
System.out.println("No clustering method was specified.");
System.exit(1);
return null;
}
}
/**
* Returns an {@link ObjectOutputStream} for the file referred to by the
* {@code --Save} option or {@link null} if the option was not used.
*/
protected ObjectOutputStream openSaveFile() {
try {
if (saveStream == null && argOptions.hasOption('S'))
saveStream = new ObjectOutputStream(new FileOutputStream(
argOptions.getStringOption('S')));
return saveStream;
} catch (IOException ioe) {
throw new IOError(ioe);
}
}
/**
* Returns an {@link ObjectInputStream} for the file referred to by the
* {@code
* --Load} option or {@link null} if the option was not used.
*/
protected ObjectInputStream openLoadFile() {
try {
if (loadStream == null && argOptions.hasOption('L'))
loadStream = new ObjectInputStream(new FileInputStream(
argOptions.getStringOption('L')));
return loadStream;
} catch (IOException ioe) {
throw new IOError(ioe);
}
}
/**
* Writes the {@code obj} to the given {@link ObjectOutputStream}.
*/
protected void saveObject(ObjectOutputStream outStream, Object obj) {
try {
outStream.writeObject(obj);
} catch (IOException ioe) {
throw new IOError(ioe);
}
}
/**
* Returns an object of type {@code T} from the provided {@link
* ObjectInputStream}. This method does the casting, so assignments should
* be done directly to a pointer and not through a ternary operator,
* otherwise the cast will need to be done a second time.
*/
@SuppressWarnings("unchecked")
protected <T> T loadObject(ObjectInputStream inStream) {
try {
return (T) inStream.readObject();
} catch (IOException ioe) {
throw new IOError(ioe);
} catch (ClassNotFoundException cnfe) {
throw new IOError(cnfe);
}
}
}