/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.tools;
import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.common.DimensionallyInterpretableSemanticSpace;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.SemanticSpaceIO;
import edu.ucla.sspace.common.Similarity;
import edu.ucla.sspace.text.WordIterator;
import edu.ucla.sspace.util.NearestNeighborFinder;
import edu.ucla.sspace.util.PartitioningNearestNeighborFinder;
import edu.ucla.sspace.util.SimpleNearestNeighborFinder;
import edu.ucla.sspace.vector.SparseVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.VectorIO;
import edu.ucla.sspace.util.SortedMultiMap;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* A utility class that operates as a command-line tool for interacting with
* semantic space files. The utility also provides script execution
* capabilities for its commands. This allows users to develop custom methods
* of interacting with one or more semantic spaces. In additoin, scripting can
* help automate certain forms of tests on the expected contents of a semantic
* space.
*
* @author David Jurgens
*/
public class SemanticSpaceExplorer {
/**
* A set of commands that can be issued to the semantic space explorer.
*/
private enum Command {
LOAD,
UNLOAD,
GET_NEIGHBORS,
GET_SIMILARITY,
COMPARE_SSPACE_VECTORS,
HELP,
WRITE_COMMAND_RESULTS,
SET_CURRENT_SSPACE,
GET_CURRENT_SSPACE,
PRINT_VECTOR,
ALIAS,
GET_WORDS,
DESCRIBE_DIMENSION,
DESCRIBE_SEMANTIC_SPACE
}
/**
* A mapping from the abbreviation for a command to its {@link Command}
* instance.
*/
private static final Map<String,Command> abbreviatedCommands
= new HashMap<String,Command>();
// For each of the commands, take the first letter of each word in its name
// to form the abbreviated command string.
static {
for (Command c : Command.values()) {
String[] commandWords = c.toString().split("_");
StringBuilder abbv = new StringBuilder();
for (String w : commandWords)
abbv.append(w.charAt(0));
abbreviatedCommands.put(abbv.toString().toLowerCase(), c);
}
}
/**
* The mapping from file name to the {@code SemanticSpace} that was loaded
* from that file.
*/
private final Map<String,SemanticSpace> fileNameToSSpace;
/**
* The mapping from the alias of a semantic space to the file name from
* which it was loaded.
*/
private final Map<String,String> aliasToFileName;
/**
* The current {@code SemanticSpace} to be used when invoking commands
*/
private SemanticSpace current;
/**
* The {@code NearestNeighborFinder} for the current {@code SemanticSpace}
* or {@code null} if the nearest terms have yet to be searched for.
*/
private NearestNeighborFinder currentNnf;
/**
* Constructs an instance of {@code SemanticSpaceExplorer}.
*/
private SemanticSpaceExplorer() {
fileNameToSSpace = new LinkedHashMap<String,SemanticSpace>();
aliasToFileName = new HashMap<String,String>();
current = null;
}
/**
* Returns the name of the file form which the current {@code SemanticSpace}
* was loaded, or {@code null} if no semantic space is currently open.
*
* @return the name of the file from which the current space was loaded
*/
private String getCurrentSSpaceFileName() {
// REMINDER: This instruction is expected to be rare, so rather than
// save the name and require a lookup every time the current sspace
// is needed, we use an O(n) call to find the name as necessary
for (Map.Entry<String,SemanticSpace> e : fileNameToSSpace.entrySet()) {
if (e.getValue() == current) {
return e.getKey();
}
}
return null;
}
/**
* Returns the {@code SemanticSpace} linked to the name, either as an alias
* or as a file name.
*
* @param name the alias or file name of a loaded semantic space
*
* @return the loaded semantic space or {@code null} no space with the
* provided name exists
*/
private SemanticSpace getSSpace(String name) {
String aliased = aliasToFileName.get(name);
return (aliased != null)
? fileNameToSSpace.get(aliased)
: fileNameToSSpace.get(name);
}
/**
* Executes the specified command and writes any output to standard out. If
* an error occurs an error message will be written instead.
*
* @param commandTokens the series of tokens that comprise the command and
* all of its arguments
*
* @return {@code true} if the command was successfully executed
*/
public boolean execute(Iterator<String> commandTokens) {
return execute(commandTokens, System.out);
}
/**
* Executes the specified command and writes any output to the provided
* stream. If an error occurs an error message will be written to the
* stream instead.
*
* @param commandTokens the series of tokens that comprise the command and
* all of its arguments
* @param out the stream to which any output should be written
*
* @return {@code true} if the command was successfully executed
*/
private boolean execute(Iterator<String> commandTokens, PrintStream out) {
// No-op for empty commands
if (!commandTokens.hasNext())
return false;
// Convert the name of the command into a Command Enum
String commandStr = commandTokens.next();
Command command = null;
try {
command =
Command.valueOf(commandStr.replaceAll("-", "_").toUpperCase());
} catch (IllegalArgumentException iae) {
command = abbreviatedCommands.get(commandStr);
if (command == null) {
out.println("Unknown command: " + commandStr);
return false;
}
}
// A giant switch statement for all of the commands
command_switch:
switch (command) {
// Loads the semantic space from a file
case LOAD: {
if (!commandTokens.hasNext()) {
out.println("missing .sspace file argument");
return false;
}
String sspaceFileName = commandTokens.next();
// Don't re-open .sspace files that are already loaded
if (fileNameToSSpace.containsKey(sspaceFileName))
break;
SemanticSpace sspace = null;
try {
sspace = SemanticSpaceIO.load(sspaceFileName);
} catch (Throwable t) {
// Catch Throwable since this method may throw an IOError
out.println("an error occurred while loading the semantic " +
"space from " + sspaceFileName + ":\n" + t);
t.printStackTrace();
}
fileNameToSSpace.put(sspaceFileName, sspace);
current = sspace;
currentNnf = null;
break;
}
// Removes all references to the space, which free the associated
// memory.
case UNLOAD: {
if (!commandTokens.hasNext()) {
out.println("missing .sspace file argument");
return false;
}
String sspaceName = commandTokens.next();
String aliased = aliasToFileName.get(sspaceName);
SemanticSpace removed = null;
if (aliased != null) {
aliasToFileName.remove(sspaceName);
removed = fileNameToSSpace.remove(aliased);
}
else {
removed = fileNameToSSpace.remove(sspaceName);
// Remove the alias for the file if it existed
Iterator<Map.Entry<String,String>> it =
aliasToFileName.entrySet().iterator();
while (it.hasNext()) {
Map.Entry<String,String> e = it.next();
if (e.getValue().equals(sspaceName)) {
it.remove();
break;
}
}
}
// If we are removing the current semantic space, reassign it to be
// the oldest semantic space, or if none are available, null.
if (removed == current) {
Iterator<SemanticSpace> it =
fileNameToSSpace.values().iterator();
current = (it.hasNext()) ? it.next() : null;
}
break;
}
// Creates an alias for a semantic space file. This is useful for long
// file names.
case ALIAS: {
if (!commandTokens.hasNext()) {
out.println("missing .sspace file argument");
return false;
}
String fileName = commandTokens.next();
if (!fileNameToSSpace.containsKey(fileName)) {
out.println(fileName + "is not currently loaded");
return false;
}
if (!commandTokens.hasNext()) {
out.println("missing alias name");
return false;
}
String alias = commandTokens.next();
aliasToFileName.put(alias, fileName);
break;
}
// Finds the nearest neighbors to a word in the current semantic space
case GET_NEIGHBORS: {
if (!commandTokens.hasNext()) {
out.println("missing word argument");
return false;
}
String focusWord = commandTokens.next();
int neighbors = 10;
if (commandTokens.hasNext()) {
String countStr = commandTokens.next();
try {
neighbors = Integer.parseInt(countStr);
} catch (NumberFormatException nfe) {
out.println("invalid number of neighbors: " + countStr);
return false;
}
}
// If this is the first time the nearest neighbors have been
// searched for, construct a new NNF
if (currentNnf == null)
currentNnf = new PartitioningNearestNeighborFinder(current);
// Using the provided or default arguments find the closest
// neighbors to the target word in the current semantic space
SortedMultiMap<Double,String> mostSimilar =
currentNnf.getMostSimilar(focusWord, neighbors);
if (mostSimilar == null) {
out.println(focusWord +
" is not in the current semantic space");
}
else {
// Print each of the neighbors and their similarity score
for (Map.Entry<Double,String> e : mostSimilar.entrySet()) {
out.println(e.getValue() + "\t" + e.getKey());
}
}
break;
}
// Get the similarity for two words
case GET_SIMILARITY: {
if (current == null) {
out.println("no current semantic space");
return false;
}
if (!commandTokens.hasNext()) {
out.println("missing word argument");
return false;
}
String word1 = commandTokens.next();
if (!commandTokens.hasNext()) {
out.println("missing word argument");
return false;
}
String word2 = commandTokens.next();
Similarity.SimType simType = Similarity.SimType.COSINE;
if (commandTokens.hasNext()) {
// Upper case since it's an enum
String simTypeStr = commandTokens.next().toUpperCase();
try {
simType = Similarity.SimType.valueOf(simTypeStr);
} catch (IllegalArgumentException iae) {
// See if the user provided a prefix of the similarity
// measure's name
for (Similarity.SimType t : Similarity.SimType.values())
if (t.name().startsWith(simTypeStr))
simType = t;
// If no prefix was found, report an error
if (simType == null) {
out.println("invalid similarity measure: " +simTypeStr);
return false;
}
}
}
Vector word1vec = current.getVector(word1);
if (word1vec == null) {
out.println(word1 + " is not in semantic space "
+ getCurrentSSpaceFileName());
break;
}
Vector word2vec = current.getVector(word2);
if (word2vec == null) {
out.println(word2 + " is not in semantic space "
+ getCurrentSSpaceFileName());
break;
}
double similarity =
Similarity.getSimilarity(simType, word1vec, word2vec);
out.println(similarity);
break;
}
// Compare the vectors for the same word from two different semantic
// spaces
case COMPARE_SSPACE_VECTORS: {
if (!commandTokens.hasNext()) {
out.println("missing word argument");
return false;
}
String word = commandTokens.next();
if (!commandTokens.hasNext()) {
out.println("missing sspace argument");
return false;
}
String name1 = commandTokens.next();
SemanticSpace sspace1 = getSSpace(name1);
if (sspace1 == null) {
out.println("no such semantic space: " + name1);
return false;
}
if (!commandTokens.hasNext()) {
out.println("missing sspace argument");
return false;
}
String name2 = commandTokens.next();
SemanticSpace sspace2 = getSSpace(name2);
if (sspace2 == null) {
out.println("no such semantic space: " + name2);
return false;
}
Similarity.SimType simType = Similarity.SimType.COSINE;
if (commandTokens.hasNext()) {
String simTypeStr = commandTokens.next();
try {
simType = Similarity.SimType.valueOf(simTypeStr);
} catch (IllegalArgumentException iae) {
out.println("invalid similarity measure: " + simTypeStr);
return false;
}
}
// Get the vectors from each dimension
Vector sspace1vec = sspace1.getVector(word);
if (sspace1vec == null) {
out.println(word + " is not in semantic space "
+ name1);
break;
}
Vector sspace2vec = sspace2.getVector(word);
if (sspace2vec == null) {
out.println(word + " is not in semantic space "
+ name2);
break;
}
// Ensure that the two have the same number of dimensions
if (sspace1vec.length() != sspace2vec.length()) {
out.println(name1 + " and " + name2 + " have different numbers "
+ "of dimensions and are not comparable.");
break;
}
double similarity =
Similarity.getSimilarity(simType, sspace1vec, sspace2vec);
out.println(similarity);
break;
}
case HELP: {
out.println("available commands:\n" + getCommands());
break;
}
// Write the results of a command to a file
case WRITE_COMMAND_RESULTS: {
if (!commandTokens.hasNext()) {
out.println("missing file destination argument");
return false;
}
String fileName = commandTokens.next();
try {
// Open up a new output stream where the command's results will
// be sent
PrintStream ps = new PrintStream(fileName);
// Recursively call execute using the file as the new output
// stream
execute(commandTokens, ps);
ps.close();
} catch (IOException ioe) {
out.println("An error occurred while writing to " + fileName +
":\n" + ioe);
}
break;
}
// Print the vector for a word
case PRINT_VECTOR: {
if (current == null) {
out.println("no current semantic space");
return false;
}
if (!commandTokens.hasNext()) {
out.println("missing word argument");
return false;
}
String word = commandTokens.next();
Vector vec = current.getVector(word);
if (vec == null) {
out.println(word + " is not in semantic space " +
getCurrentSSpaceFileName());
break;
}
out.println(VectorIO.toString(vec));
break;
}
// Update the current semantic space
case SET_CURRENT_SSPACE: {
if (!commandTokens.hasNext()) {
out.println("missing .sspace file argument");
return false;
}
String spaceName = commandTokens.next();
// Check whether the name was an alias
String fileName = aliasToFileName.get(spaceName);
// If the argument wasn't an alias, the arg was the file name
if (fileName == null)
fileName = spaceName;
SemanticSpace s = fileNameToSSpace.get(fileName);
if (s == null) {
out.println("no such .sspace (file is not currently loaded)");
return false;
}
current = s;
break;
}
// Get the name of the current semantic space
case GET_CURRENT_SSPACE: {
String currentSpaceName = getCurrentSSpaceFileName();
if (currentSpaceName != null)
out.println(currentSpaceName);
else
out.println("none");
break;
}
// Prints out the words in the semantic space
case GET_WORDS: {
String prefix = null;
if (commandTokens.hasNext())
prefix = commandTokens.next();
Set<String> words = current.getWords();
for (String word : words) {
if (prefix == null)
out.println(word);
else if (word.startsWith(prefix))
out.println(word);
}
break;
}
// Describes the dimension, if the current sspace has annotations
case DESCRIBE_DIMENSION: {
if (current instanceof DimensionallyInterpretableSemanticSpace) {
if (!commandTokens.hasNext()) {
out.println("Must supply a dimension number");
break;
}
int dim = -1;
String next = commandTokens.next();
try {
dim = Integer.parseInt(next);
} catch (NumberFormatException nfe) {
out.println("Invalid dimension: " + next);
break;
}
DimensionallyInterpretableSemanticSpace<?> diss =
(DimensionallyInterpretableSemanticSpace)current;
try {
out.println(diss.getDimensionDescription(dim).toString());
} catch (Exception e) {
out.println(e.getMessage());
}
}
else
out.println("Current space has no dimension descriptions");
break;
}
// Prints out statistics on the current sspaces
case DESCRIBE_SEMANTIC_SPACE: {
if (current == null) {
out.println("no .sspace loaded");
break;
}
String name = current.getSpaceName();
boolean hasDimDescriptions =
current instanceof DimensionallyInterpretableSemanticSpace;
int dims = current.getVectorLength();
int words = current.getWords().size();
boolean isSparse = (current.getWords().isEmpty()) ||
current.getVector(current.getWords().iterator().next())
instanceof SparseVector;
out.println(name + ": " + words + " words, "
+ dims + " dimensions"
+ ((hasDimDescriptions)
? " with descriptions" : "")
+ ((isSparse) ? ", sparse vectors"
: ", dense vectors"));
break;
}
default: // should never get executed
assert false : command;
}
return true;
}
/**
* Returns a formatted list of the available commands that a {@code
* SemanticSpaceExplorer} instance will recognize.
*
* @return the commands
*/
private static String getCommands() {
return
" load file1.sspace [file2.sspace...]\n" +
" unload file1.sspace [file2.sspace...]\n" +
" get-neighbors word [number (default 10)] [similarity measure]\n" +
" get-similarity word1 word2 [similarity measure " +
"(default cosine)]\n" +
" compare-sspace-vectors word sspace1 sspace2 " +
"[similarity measure (default: cosine)]\n" +
" help\n" +
" set-current-sspace filename.sspace\n" +
" get-current-sspace\n" +
" alias filename.sspace name\n" +
" write-command-results output-file command...\n" +
" print-vector word\n" +
" get-words [string-prefix]\n" +
" describe-dimension number\n" +
" describe-semantic-space\n";
}
/**
* Prints the options and supported commands used by this program.
*
* @param options the options supported by the system
*/
private static void usage(ArgOptions options) {
System.out.println("usage: java SemanticSpaceExplorer [options]\n\n" +
"Command line options:\n" + options.prettyPrint() +
"\n\nExplorer commands:\n" + getCommands());
}
public static void main(String[] args) {
ArgOptions options = new ArgOptions();
options.addOption('h', "help", "Generates a help message and exits",
false, null, "Program Options");
options.addOption('f', "executeFile", "Executes the commands in the " +
"specified file and exits", true, "FILE",
"Program Options");
options.addOption('s', "saveRecord", "Saves a record of all the " +
"executed commands to the specfied file", true,
"FILE", "Program Options");
options.parseOptions(args);
if (options.hasOption("help")) {
usage(options);
return;
}
PrintWriter recordFile = null;
if (options.hasOption("saveRecord")) {
try {
recordFile = new PrintWriter(
options.getStringOption("saveRecord"));
} catch (IOException ioe) {
System.out.println("Unable to open file for saving commands:\n"
+ ioe);
}
}
BufferedReader commandsToExecute = null;
if (options.hasOption("executeFile")) {
try {
commandsToExecute = new BufferedReader(new FileReader(
options.getStringOption("executeFile")));
} catch (IOException ioe) {
System.out.println("unable to open commands file " +
options.getStringOption("executeFile")
+ ":\n" + ioe);
return;
}
}
else {
commandsToExecute =
new BufferedReader(new InputStreamReader(System.in));
}
boolean suppressPrompt = options.hasOption("executeFile");
SemanticSpaceExplorer explorer = new SemanticSpaceExplorer();
Pattern regex = Pattern.compile("[^\\s\"']+|\"([^\"]*)\"|'([^']*)'");
try {
if (!suppressPrompt)
System.out.print("> ");
for (String command = null;
(command = commandsToExecute.readLine()) != null; ) {
// Break the string, but recognize compound tokens as encoded
// with "" characters
List<String> tokens = new ArrayList<String>();
Matcher regexMatcher = regex.matcher(command);
while (regexMatcher.find()) {
if (regexMatcher.group(1) != null) {
// Add double-quoted string without the quotes
tokens.add(regexMatcher.group(1));
} else if (regexMatcher.group(2) != null) {
// Add single-quoted string without the quotes
tokens.add(regexMatcher.group(2));
} else {
// Add unquoted word
tokens.add(regexMatcher.group());
}
}
// Iterator<String> commandTokens = new WordIterator(command);
if (explorer.execute(tokens.iterator()) && recordFile != null) {
recordFile.println(command);
}
if (!suppressPrompt)
System.out.print("> ");
}
} catch (IOException ioe) {
System.out.println("An error occurred while reading in a command:\n"
+ ioe);
}
if (recordFile != null) {
recordFile.close();
}
}
}