package edu.stanford.nlp.sentiment;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.LabeledScoredTreeNode;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.util.CollectionUtils;
import java.util.function.Function;
import edu.stanford.nlp.util.Generics;
/**
* Reads the sentiment dataset and writes it to the appropriate files.
*
* @author John Bauer
*/
public class ReadSentimentDataset {
/** Maps a tree node to the string value of its label. */
static final Function<Tree, String> TRANSFORM_TREE_TO_WORD = node -> node.label().value();

/**
 * Rewrites a literal "(" or ")" token as -LRB- / -RRB-; every other
 * token is returned unchanged.
 */
static final Function<String, String> TRANSFORM_PARENS = token -> {
  switch (token) {
    case "(":
      return "-LRB-";
    case ")":
      return "-RRB-";
    default:
      return token;
  }
};
// A bunch of trees have some funky tokenization which we can
// somewhat correct using these tregex / tsurgeon expressions.
//
// Entry i of this array is the search pattern for entry i of
// tsurgeonPatterns: convertTree applies the pairs in array order, and the
// static initializer below rejects arrays of different lengths.  When adding
// or removing a pattern, update both arrays at the same position.
static final TregexPattern[] tregexPatterns = {
// "-LRB- <letter> -RRB-" group sitting to the right of a word (=word is the
// rightmost leaf of the left sibling): paired operation appends the letter to the word
TregexPattern.compile("__=single <1 (__ < /^-LRB-$/) <2 (__ <... { (__ < /^[a-zA-Z]$/=letter) ; (__ < /^-RRB-$/) }) > (__ <2 =single <1 (__=useless <<- (__=word !< __)))"),
// "-LRB- a -RRB-" group followed by a stray "n": paired operation relabels the "n" as <letter>n
TregexPattern.compile("__=single <1 (__ < /^-LRB-$/) <2 (__ <... { (__ < /^[aA]$/=letter) ; (__ < /^-RRB-$/) }) > (__ <1 =single <2 (__=useless <<, /^n$/=word))"),
// "-LRB- a -RRB-" group in any other position: paired operation excises the
// wrapper nodes and prunes the closing paren
TregexPattern.compile("__=single <1 (__ < /^-LRB-$/) <2 (__=A <... { (__ < /^[aA]$/=letter) ; (__=paren < /^-RRB-$/) })"),
// "provide" (any case) followed by a "-LRB- s" group: paired operation appends "s" to the word
TregexPattern.compile("__ <1 (__ <<- (/^(?i:provide)$/=provide !<__)) <2 (__ <<, (__=s > __=useless <... { (__ <: -LRB-) ; (__ <1 (__ <: s)) } ))"),
// "-LRB- <letter> -RRB-" group sitting to the left of a word (=word is the
// leftmost leaf of the right sibling): paired operation prepends the letter to the word
TregexPattern.compile("__=single <1 (__ < /^-LRB-$/) <2 (__ <... { (__ < /^[a-zA-Z]$/=letter) ; (__ < /^-RRB-$/) }) > (__ <1 =single <2 (__=useless <<, (__=word !< __)))"),
// A -LRB- with nothing before it, paired with the only -RRB- under =ltop:
// paired operation prunes both parens and excises =ltop / =rtop.
// NOTE(review): presumably targets a paren pair wrapping a whole constituent - confirm
TregexPattern.compile("-LRB-=lrb !, __ : (__=ltop > __ <<, =lrb <<- (-RRB-=rrb > (__ > __=rtop)) !<< (-RRB- !== =rrb))"),
// uncensor "fucked"
TregexPattern.compile("__=top <1 (__=f1 < f) <2 (__=f2 <... { (__ < /^[*\\\\]+$/) ; (__ < ed) })"),
// fix don ' t
TregexPattern.compile("__=top <1 (__=f1 <1 (__ < don=do) <2 (__ < /^[\']$/=apos)) <2 (__=wrong < t)"),
// parens at the start of a sentence - always appears wrong
TregexPattern.compile("-LRB-=lrb !, __ .. (-RRB-=rrb !< __ !.. -RRB-)"),
// parens with a single word that we can drop
TregexPattern.compile("-LRB-=lrb . and|Haneke|is|Evans|Harmon|Harris|its|it|Aniston|headbanger|Testud|but|frames|yet|Denis|DeNiro|sinks|screenwriter|Cho|meditation|Watts|that|the|this|Madonna|Ahola|Franco|Hopkins|Crudup|writer-director|Diggs|very|Crane|Frei|Reno|Jones|Quills|Bobby|Hill|Kim|subjects|Wang|Jaglom|Vega|Sabara|Sade|Goldbacher|too|being|opening=last : (=last . -RRB-=rrb)"),
// parens with two word expressions
TregexPattern.compile("-LRB-=lrb . (__=n1 !< __ . (__=n2 !< __ . -RRB-=rrb)) : (=n1 (== Besson|Kissinger|Godard|Seagal|jaglon|It|it|Tsai|Nelson|Rifkan|Shakespeare|Solondz|Madonna|Herzog|Witherspoon|Woo|Eyre|there|Moore|Ricci|Seinfeld . (=n2 == /^\'s$/)) | (== Denis|Skins|Spears|Assayas . (=n2 == /^\'$/)) | (== Je-Gyu . (=n2 == is)) | (== the . (=n2 == leads|film|story|characters)) | (== Monsoon . (=n2 == Wedding)) | (== De . (=n2 == Niro)) | (== Roman . (=n2 == Coppola)) | (== than . (=n2 == Leon)) | (==Colgate . (=n2 == /^U.$/)) | (== teen . (=n2 == comedy)) | (== a . (=n2 == remake)) | (== Powerpuff . (=n2 == Girls)) | (== Woody . (=n2 == Allen)))"),
// parens with three word expressions
TregexPattern.compile("-LRB-=lrb . (__=n1 !< __ . (__=n2 !< __ . (__=n3 !< __ . -RRB-=rrb))) : (=n1 [ (== the . (=n2 == characters . (=n3 == /^\'$/))) | (== the . (=n2 == movie . (=n3 == /^\'s$/))) | (== of . (=n2 == middle-aged . (=n3 == romance))) | (== Jack . (=n2 == Nicholson . (=n3 == /^\'s$/))) | (== De . (=n2 == Palma . (=n3 == /^\'s$/))) | (== Clara . (=n2 == and . (=n3 == Paul))) | (== Sex . (=n2 == and . (=n3 == Lucía))) ])"),
// only one of these, so can be very general
// (paired operation replaces the grandparent of "401" with a fixed "401k statement" subtree)
TregexPattern.compile("/^401$/ > (__ > __=top)"),
// "by all means" where "all" and "means" hang under different grandparents:
// paired operation moves "means" next to "all" and groups the two
TregexPattern.compile("by . (all > (__=all > __=allgp) . (means > (__=means > __=meansgp))) : (=allgp !== =meansgp)"),
// 20th century, 21st century
// (paired operation relabels the capitalized "Century" as "century")
TregexPattern.compile("/^(?:20th|21st)$/ . Century=century"),
// Fix any stranded unitary nodes
TregexPattern.compile("__ <: (__=unitary < __)"),
// relabel some nodes where punctuation changes the score for no apparent reason
// TregexPattern.compile("__=node <2 (__ < /^[!.?,;]$/) !<1 ~node <1 __=child > ~child"),
// TODO: relabel words in some less expensive way?
// A node labeled 1 whose only child is "protagonist" (any case): paired operation relabels it 2
TregexPattern.compile("/^[1]$/=label <: /^(?i:protagonist)$/"),
};
// Tree surgery operations.  Entry i is run on every match of
// tregexPatterns[i] (see convertTree), so the node names used here
// (word, letter, single, ...) are the ones bound by the pattern at the same index.
static final TsurgeonPattern[] tsurgeonPatterns = {
// word + letter: relabel the word as <word><letter>, drop the paren group
Tsurgeon.parseOperation("[relabel word /^.*$/={word}={letter}/] [prune single] [excise useless useless]"),
// letter + "n": relabel the "n" as <letter>n, drop the paren group
Tsurgeon.parseOperation("[relabel word /^.*$/={letter}n/] [prune single] [excise useless useless]"),
// remove the nodes from "single" down to "A" and prune the closing paren
Tsurgeon.parseOperation("[excise single A] [prune paren]"),
// relabel the matched word as <provide>s and drop the separate "s"
Tsurgeon.parseOperation("[relabel provide /^.*$/={provide}s/] [prune s] [excise useless useless]"),
// letter + word: relabel the word as <letter><word>, drop the paren group
Tsurgeon.parseOperation("[relabel word /^.*$/={letter}={word}/] [prune single] [excise useless useless]"),
// drop both parens and the nodes that held them
Tsurgeon.parseOperation("[prune lrb] [prune rrb] [excise ltop ltop] [excise rtop rtop]"),
// replace the censored subtree with a single word labeled 0
Tsurgeon.parseOperation("replace top (0 fucked)"),
// "don" "'" "t" -> "do" "n't": drop the "t", relabel "don" and the apostrophe
Tsurgeon.parseOperation("[prune wrong] [relabel do do] [relabel apos /^.*$/n={apos}t/] [excise top top]"),
// Note: the next couple leave unitary nodes, so we then fix them at the end
// (one identical paren-pruning operation for each of the four paren patterns)
Tsurgeon.parseOperation("[prune rrb] [prune lrb]"),
Tsurgeon.parseOperation("[prune rrb] [prune lrb]"),
Tsurgeon.parseOperation("[prune rrb] [prune lrb]"),
Tsurgeon.parseOperation("[prune rrb] [prune lrb]"),
// fixed replacement subtree for the single "401" match
Tsurgeon.parseOperation("replace top (2 (2 401k) (2 statement))"),
// move "means" to directly follow "all", remove its old grandparent, and
// wrap "all means" in a new node labeled 2
Tsurgeon.parseOperation("[move means $- all] [excise meansgp meansgp] [createSubtree 2 all means]"),
// lowercase "Century"
Tsurgeon.parseOperation("relabel century century"),
// Fix any stranded unitary nodes
Tsurgeon.parseOperation("[excise unitary unitary]"),
//Tsurgeon.parseOperation("relabel node /^.*$/={child}/"),
// force the matched label to 2
Tsurgeon.parseOperation("relabel label /^.*$/2/"),
};
static {
if (tregexPatterns.length != tsurgeonPatterns.length) {
throw new RuntimeException("Expected the same number of tregex and tsurgeon when initializing");
}
}
// Private constructor: every member of this class is static, so it is never instantiated.
private ReadSentimentDataset() {} // static class
/**
 * Builds the sentiment-labeled tree for one sentence and applies the
 * tregex / tsurgeon tokenization fixes to it.
 * <br>
 * Node {@code i} for {@code i < sentence.size()} is the preterminal above
 * word {@code i}; the remaining node indices are internal nodes.  Every
 * node is labeled with a sentiment class computed as
 * {@code floor(score * 5)}, capped at 4.
 *
 * @param parentPointers for each node index, the index of its parent, or -1 for the root
 * @param sentence the words of the sentence, in order
 * @param phraseIds maps a phrase (list of words) to its phrase id
 * @param sentimentScores maps a phrase id to its sentiment score
 * @param escaper applied to the leaf words after the phrase lookups are done
 * @return the root of the corrected tree
 * @throws RuntimeException if the pointers describe zero or several roots,
 *         or if a phrase id or sentiment score cannot be found for a node
 */
public static Tree convertTree(List<Integer> parentPointers, List<String> sentence, Map<List<String>, Integer> phraseIds, Map<Integer, Double> sentimentScores, PTBEscapingProcessor escaper) {
  int maxNode = 0;
  for (Integer parent : parentPointers) {
    maxNode = Math.max(maxNode, parent);
  }

  Tree[] subtrees = new Tree[maxNode + 1];
  // Preterminals: an (as yet unlabeled) node over a leaf holding the word
  for (int i = 0; i < sentence.size(); ++i) {
    CoreLabel word = new CoreLabel();
    word.setValue(sentence.get(i));
    Tree leaf = new LabeledScoredTreeNode(word);
    subtrees[i] = new LabeledScoredTreeNode(new CoreLabel());
    subtrees[i].addChild(leaf);
  }
  // Internal nodes
  for (int i = sentence.size(); i <= maxNode; ++i) {
    subtrees[i] = new LabeledScoredTreeNode(new CoreLabel());
  }

  boolean[] connected = new boolean[maxNode + 1];
  Tree root = null;
  for (int index = 0; index < parentPointers.size(); ++index) {
    if (parentPointers.get(index) == -1) {
      if (root != null) {
        throw new RuntimeException("Found two roots for sentence " + sentence);
      }
      root = subtrees[index];
    } else {
      // Walk up the tree structure to make sure that leftmost
      // phrases are added first.  Otherwise, if the numbers are
      // inverted, we might get the right phrase added to a parent
      // first, resulting in "case zero in this", for example,
      // instead of "in this case zero"
      // Note that because we keep track of which ones are already
      // connected, we process this at most once per parent, so the
      // overall construction time is still efficient.
      connect(parentPointers, subtrees, connected, index);
    }
  }
  // Without this check a rootless pointer list would hand a null tree to Tsurgeon below.
  if (root == null) {
    throw new RuntimeException("Found no root for sentence " + sentence);
  }

  for (int i = 0; i <= maxNode; ++i) {
    List<Tree> leaves = subtrees[i].getLeaves();
    List<String> words = CollectionUtils.transformAsList(leaves, TRANSFORM_TREE_TO_WORD);
    // First we look for a copy of the phrase with -LRB- -RRB-
    // instead of ().  The sentiment trees sometimes have both, and
    // the escaped versions seem to have more reasonable scores.
    // If a particular phrase doesn't have -LRB- -RRB- we fall back
    // to the unescaped versions.
    Integer phraseId = phraseIds.get(CollectionUtils.transformAsList(words, TRANSFORM_PARENS));
    if (phraseId == null) {
      phraseId = phraseIds.get(words);
    }
    if (phraseId == null) {
      // Report the phrase that actually failed the lookup, not just the enclosing sentence
      throw new RuntimeException("Could not find phrase id for phrase " + words + " in sentence " + sentence);
    }
    // TODO: should we make this an option?  Perhaps we want cases
    // where the trees have the phrase id and not their class
    Double score = sentimentScores.get(phraseId);
    if (score == null) {
      throw new RuntimeException("Could not find sentiment score for phrase id " + phraseId);
    }
    // TODO: make this a numClasses option
    // floor(score * 5), with the top bucket capped at class 4
    int classLabel = Math.min(4, Math.round((float) Math.floor(score * 5.0)));
    subtrees[i].label().setValue(Integer.toString(classLabel));
  }

  // Escape the words only now: the phrase lookups above need the raw tokens
  for (int i = 0; i < sentence.size(); ++i) {
    Tree leaf = subtrees[i].children()[0];
    leaf.label().setValue(escaper.escapeString(leaf.label().value()));
  }

  // Apply each tokenization fix in order; an operation may replace the root
  for (int i = 0; i < tregexPatterns.length; ++i) {
    root = Tsurgeon.processPattern(tregexPatterns[i], tsurgeonPatterns[i], root);
  }

  return root;
}
/**
 * Attaches the node at {@code index} to its parent, then keeps climbing
 * and attaching ancestors until it reaches a node that is already
 * connected or that has a negative parent pointer.
 *
 * @param parentPointers parent index for each node; a negative value stops the climb
 * @param subtrees the node for each index
 * @param connected marks the nodes that have already been added to their parent
 * @param index the node to start from
 */
private static void connect(List<Integer> parentPointers, Tree[] subtrees, boolean[] connected, int index) {
  int current = index;
  while (!connected[current]) {
    int parent = parentPointers.get(current);
    if (parent < 0) {
      break;
    }
    subtrees[parent].addChild(subtrees[current]);
    connected[current] = true;
    current = parent;
  }
}
/**
 * Writes the selected trees to a file, one tree per line, encoded as utf-8
 * (the same encoding used to read the dataset files).
 *
 * @param filename path of the file to create or overwrite
 * @param trees all converted trees
 * @param treeIds indices into {@code trees} of the trees to write, in output order
 * @throws RuntimeIOException if the file cannot be opened or written
 */
private static void writeTrees(String filename, List<Tree> trees, List<Integer> treeIds) {
  // try-with-resources closes both streams even when a write fails part way through
  try (FileOutputStream fos = new FileOutputStream(filename);
       BufferedWriter bout = new BufferedWriter(new OutputStreamWriter(fos, "utf-8"))) {
    for (Integer id : treeIds) {
      bout.write(trees.get(id).toString());
      bout.write("\n");
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
/**
 * This program converts the format of the Sentiment data set
 * prepared by Richard, Jean, etc. into trees readable with the
 * normal TreeReaders.
 * <br>
 * An example command line is
 * <br>
 * <code>java edu.stanford.nlp.sentiment.ReadSentimentDataset -dictionary stanfordSentimentTreebank/dictionary.txt -sentiment stanfordSentimentTreebank/sentiment_labels.txt -tokens stanfordSentimentTreebank/SOStr.txt -parse stanfordSentimentTreebank/STree.txt -split stanfordSentimentTreebank/datasetSplit.txt -train train.txt -dev dev.txt -test test.txt</code>
 * <br>
 * The arguments are as follows: <br>
 * <code>-dictionary</code>, <code>-sentiment</code>,
 * <code>-tokens</code>, <code>-parse</code>, <code>-split</code>
 * Path to the corresponding files from the dataset <br>
 * <code>-train</code>, <code>-dev</code>, <code>-test</code>
 * Paths for saving the corresponding output files <br>
 * Each of these arguments is required.  If one is missing, has no value,
 * or is not recognized, a message is printed to stderr and the program
 * exits with status 2.
 * <br>
 * Macro arguments exist in -inputDir and -outputDir, so you can for example run <br>
 * <code>java edu.stanford.nlp.sentiment.ReadSentimentDataset -inputDir ../data/sentiment/stanfordSentimentTreebank -outputDir .</code>
 */
public static void main(String[] args) {
  String dictionaryFilename = null;
  String sentimentFilename = null;
  String tokensFilename = null;
  String parseFilename = null;
  String splitFilename = null;

  String trainFilename = null;
  String devFilename = null;
  String testFilename = null;

  int argIndex = 0;
  while (argIndex < args.length) {
    if (args[argIndex].equalsIgnoreCase("-dictionary")) {
      dictionaryFilename = argumentValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-sentiment")) {
      sentimentFilename = argumentValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokens")) {
      tokensFilename = argumentValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-parse")) {
      parseFilename = argumentValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-split")) {
      splitFilename = argumentValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-inputDir") ||
               args[argIndex].equalsIgnoreCase("-inputDirectory")) {
      String inputDir = argumentValue(args, argIndex);
      dictionaryFilename = inputDir + "/dictionary.txt";
      sentimentFilename = inputDir + "/sentiment_labels.txt";
      tokensFilename = inputDir + "/SOStr.txt";
      parseFilename = inputDir + "/STree.txt";
      splitFilename = inputDir + "/datasetSplit.txt";
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-train")) {
      trainFilename = argumentValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-dev")) {
      devFilename = argumentValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-test")) {
      testFilename = argumentValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-outputDir") ||
               args[argIndex].equalsIgnoreCase("-outputDirectory")) {
      String outputDir = argumentValue(args, argIndex);
      trainFilename = outputDir + "/train.txt";
      devFilename = outputDir + "/dev.txt";
      testFilename = outputDir + "/test.txt";
      argIndex += 2;
    } else {
      System.err.println("Unknown argument " + args[argIndex]);
      System.exit(2);
    }
  }

  // Fail up front with a readable message instead of a null filename deep inside the I/O code
  requireArgument(dictionaryFilename, "-dictionary");
  requireArgument(sentimentFilename, "-sentiment");
  requireArgument(tokensFilename, "-tokens");
  requireArgument(parseFilename, "-parse");
  requireArgument(splitFilename, "-split");
  requireArgument(trainFilename, "-train");
  requireArgument(devFilename, "-dev");
  requireArgument(testFilename, "-test");

  // Sentence file is formatted
  //   w1|w2|w3...
  List<List<String>> sentences = Generics.newArrayList();
  for (String line : IOUtils.readLines(tokensFilename, "utf-8")) {
    String[] sentence = line.split("\\|");
    sentences.add(Arrays.asList(sentence));
  }

  // Split and read the phrase ids file.  This file is in the format
  //   w1 w2 w3 ... | id
  Map<List<String>, Integer> phraseIds = Generics.newHashMap();
  for (String line : IOUtils.readLines(dictionaryFilename, "utf-8")) {
    String[] pieces = line.split("\\|");
    String[] sentence = pieces[0].split(" ");
    Integer id = Integer.valueOf(pieces[1]);
    phraseIds.put(Arrays.asList(sentence), id);
  }

  // Split and read the sentiment scores file.  Each line of this
  // file is of the format:
  //   phrasenum | score
  Map<Integer, Double> sentimentScores = Generics.newHashMap();
  for (String line : IOUtils.readLines(sentimentFilename, "utf-8")) {
    // skip the header line
    if (line.startsWith("phrase")) {
      continue;
    }
    String[] pieces = line.split("\\|");
    Integer id = Integer.valueOf(pieces[0]);
    Double score = Double.valueOf(pieces[1]);
    sentimentScores.put(id, score);
  }

  // Read lines from the tree structure file.  This is a file of parent pointers for each tree.
  // Line k of the parse file is paired with line k of the tokens file.
  int index = 0;
  PTBEscapingProcessor escaper = new PTBEscapingProcessor();
  List<Tree> trees = Generics.newArrayList();
  for (String line : IOUtils.readLines(parseFilename, "utf-8")) {
    if (index >= sentences.size()) {
      throw new RuntimeException("Parse file " + parseFilename + " has more lines than tokens file " + tokensFilename);
    }
    String[] pieces = line.split("\\|");
    // The file's pointers are 1-based with 0 for the root; shift so the root becomes -1
    List<Integer> parentPointers = CollectionUtils.transformAsList(Arrays.asList(pieces), arg -> Integer.valueOf(arg) - 1);
    Tree tree = convertTree(parentPointers, sentences.get(index), phraseIds, sentimentScores, escaper);
    ++index;
    trees.add(tree);
  }

  // Split ids as written below: 1 = train, 2 = test, 3 = dev
  Map<Integer, List<Integer>> splits = Generics.newHashMap();
  splits.put(1, Generics.<Integer>newArrayList());
  splits.put(2, Generics.<Integer>newArrayList());
  splits.put(3, Generics.<Integer>newArrayList());
  for (String line : IOUtils.readLines(splitFilename, "utf-8")) {
    // skip the header line
    if (line.startsWith("sentence_index")) {
      continue;
    }
    String[] pieces = line.split(",");
    // sentence indices in the file are 1-based
    Integer treeId = Integer.valueOf(pieces[0]) - 1;
    Integer fileId = Integer.valueOf(pieces[1]);
    List<Integer> split = splits.get(fileId);
    if (split == null) {
      throw new RuntimeException("Unknown split id " + fileId + " in " + splitFilename);
    }
    split.add(treeId);
  }

  writeTrees(trainFilename, trees, splits.get(1));
  writeTrees(testFilename, trees, splits.get(2));
  writeTrees(devFilename, trees, splits.get(3));
}

/** Returns the value following the flag at {@code flagIndex}, or exits with status 2 if there is none. */
private static String argumentValue(String[] args, int flagIndex) {
  if (flagIndex + 1 >= args.length) {
    System.err.println("Missing value for argument " + args[flagIndex]);
    System.exit(2);
  }
  return args[flagIndex + 1];
}

/** Exits with status 2 if the filename for {@code flag} was never set on the command line. */
private static void requireArgument(String value, String flag) {
  if (value == null) {
    System.err.println("Missing required argument " + flag + " (or the matching -inputDir / -outputDir)");
    System.exit(2);
  }
}
}