package edu.stanford.nlp.trees;
import edu.stanford.nlp.io.ExtensionFileFilter;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Sets;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.StringLabel;
import java.io.*;
import java.text.NumberFormat;
import java.util.AbstractCollection;
import java.util.Arrays;
import java.util.Set;
/**
 * A {@code Treebank} object provides access to a corpus of examples with
 * given tree structures.
 * This class implements the {@link java.util.Collection} interface (by
 * extending {@link AbstractCollection}). However, it may offer
 * less than the full power of the Collection interface: some Treebanks are
 * read-only, and so may throw an {@link UnsupportedOperationException}.
 *
 * @author Christopher Manning
 * @author Roger Levy (added encoding variable and method)
 */
public abstract class Treebank extends AbstractCollection<Tree> {
/**
 * Stores the <code>TreeReaderFactory</code> that will be used to
 * create a <code>TreeReader</code> to process a file of trees.
 * Set by every constructor and exposed to subclasses through
 * {@link #treeReaderFactory()}.
 */
private TreeReaderFactory trf;
/**
 * Stores the charset encoding of the Treebank on disk.
 * Defaults to {@code TreebankLanguagePack.DEFAULT_ENCODING} unless a
 * constructor taking an explicit encoding is used.
 */
private String encoding = TreebankLanguagePack.DEFAULT_ENCODING;
/**
 * File suffix used by {@link #loadPath(File)} to select tree files when
 * the caller does not supply a suffix or filter of its own.
 */
public static final String DEFAULT_TREE_FILE_SUFFIX = "mrg";
/**
 * Create a new Treebank that reads trees with a freshly created
 * <code>LabeledScoredTreeReaderFactory</code>. The encoding is left at
 * its default value.
 */
public Treebank() {
this(new LabeledScoredTreeReaderFactory());
}
/**
 * Create a new Treebank that uses the given reader factory. The encoding
 * is left at its default value.
 *
 * @param trf the factory class to be called to create a new
 * <code>TreeReader</code>
 */
public Treebank(TreeReaderFactory trf) {
this.trf = trf;
}
/**
 * Create a new Treebank with an explicit reader factory and charset
 * encoding.
 *
 * @param trf the factory class to be called to create a new
 * <code>TreeReader</code>
 * @param encoding The charset encoding to use for treebank file decoding;
 * stored as given (no validation is performed here)
 */
public Treebank(TreeReaderFactory trf, String encoding) {
this.trf = trf;
this.encoding = encoding;
}
/**
 * Create a new Treebank with a capacity hint, reading trees with a freshly
 * created <code>LabeledScoredTreeReaderFactory</code>.
 *
 * @param initialCapacity The initial size of the underlying Collection,
 * (if a Collection-based storage mechanism is being provided)
 */
public Treebank(int initialCapacity) {
this(initialCapacity, new LabeledScoredTreeReaderFactory());
}
/**
 * Create a new Treebank with a capacity hint and the given reader factory.
 * This base class has no storage of its own, so
 * <code>initialCapacity</code> is not used here; the parameter exists so
 * that subclass constructors have a matching super constructor.
 *
 * @param initialCapacity The initial size of the underlying Collection,
 * (if a Collection-based storage mechanism is being provided)
 * @param trf the factory class to be called to create a new
 * <code>TreeReader</code>
 */
@SuppressWarnings({"UnusedDeclaration"})
public Treebank(int initialCapacity, TreeReaderFactory trf) {
this.trf = trf;
}
/**
 * Accessor that makes this treebank's <code>TreeReaderFactory</code>
 * available to subclasses.
 *
 * @return The TreeReaderFactory supplied at construction time
 */
protected TreeReaderFactory treeReaderFactory() {
  return this.trf;
}
/**
 * Reports the charset encoding used when treebank files are read as a
 * byte stream.
 *
 * @return The encoding in use for treebank file bytestream access.
 */
public String encoding() {
  return this.encoding;
}
/**
 * Empty a <code>Treebank</code>.
 * Declared abstract here so that every concrete subclass must provide its
 * own implementation rather than inheriting the one from
 * {@link AbstractCollection}.
 */
@Override
public abstract void clear();
/**
 * Convenience overload of {@link #loadPath(File)} that takes the path as
 * a String.
 * Loads trees from the given directory and its subdirectories (files with
 * the suffix "mrg"), or loads the single file with the given pathName
 * (including extension).
 *
 * @param pathName file or directory name
 */
public void loadPath(String pathName) {
  File path = new File(pathName);
  loadPath(path);
}
/**
 * Load trees from the given file or directory, using the default suffix
 * and descending into subdirectories.
 * When loading from a directory (tree), trees must reside in files with
 * the suffix "mrg" (this is an English Penn Treebank holdover!);
 * otherwise a single file with the given path (including extension) is
 * loaded.
 *
 * @param path File specification
 */
public void loadPath(File path) {
  final boolean recursively = true;
  loadPath(path, DEFAULT_TREE_FILE_SUFFIX, recursively);
}
/**
 * Load trees from the named file or directory, selecting files with an
 * <code>ExtensionFileFilter</code> built from the given suffix and
 * recursion flag.
 *
 * @param pathName File or directory name
 * @param suffix Extension of files to load: If <code>pathName</code>
 * is a directory, then, if this is
 * non-<code>null</code>, all and only files ending in "." followed
 * by this extension will be loaded; if it is <code>null</code>,
 * all files in directories will be loaded. If <code>pathName</code>
 * is not a directory, this parameter is ignored.
 * @param recursively descend into subdirectories as well
 */
public void loadPath(String pathName, String suffix, boolean recursively) {
  File path = new File(pathName);
  FileFilter bySuffix = new ExtensionFileFilter(suffix, recursively);
  loadPath(path, bySuffix);
}
/**
 * Load trees from the given file or directory, selecting files with an
 * <code>ExtensionFileFilter</code> built from the given suffix and
 * recursion flag.
 *
 * @param path file or directory to load from
 * @param suffix suffix of files to load
 * @param recursively descend into subdirectories as well
 */
public void loadPath(File path, String suffix, boolean recursively) {
  FileFilter bySuffix = new ExtensionFileFilter(suffix, recursively);
  loadPath(path, bySuffix);
}
/**
 * Convenience overload of {@link #loadPath(File, FileFilter)} that takes
 * the path as a String.
 * Loads trees from the given directory and its subdirectories which match
 * the file filter, or loads the single file with the given pathName
 * (including extension).
 *
 * @param pathName file or directory name
 * @param filt A filter used to determine which files match
 */
public void loadPath(String pathName, FileFilter filt) {
  File path = new File(pathName);
  loadPath(path, filt);
}
/**
 * Load trees from given path specification.
 * Every other <code>loadPath</code> overload in this class ends up
 * calling this method, so it is the single point a subclass must
 * implement to support loading.
 *
 * @param path file or directory to load from
 * @param filt a FileFilter selecting the files to load
 */
public abstract void loadPath(File path, FileFilter filt);
/**
 * Apply a TreeVisitor to each tree in the Treebank.
 * For all current implementations of Treebank, this is the fastest
 * way to traverse all the trees in the Treebank.
 * Within this class, {@link #toString()} and {@link #size()} are built on
 * top of this method.
 *
 * @param tp The TreeVisitor to be applied
 */
public abstract void apply(TreeVisitor tp);
/**
 * Wrap this treebank in a TransformingTreebank that presents each tree
 * after it has been passed through the given TreeTransformer. This
 * Treebank itself is unchanged (assuming that the TreeTransformer
 * correctly doesn't change input Trees).
 *
 * @param treeTrans The TreeTransformer to use
 * @return A Treebank (actually a TransformingTreebank) where each
 * Tree in the current treebank has been transformed using the
 * TreeTransformer.
 */
public Treebank transform(TreeTransformer treeTrans) {
  Treebank transformed = new TransformingTreebank(this, treeTrans);
  return transformed;
}
/**
 * Render the whole treebank as text: the <code>toString()</code> form of
 * every tree, each followed by a newline, in the order in which
 * {@link #apply(TreeVisitor)} visits them.
 * Calling this is a really bad idea if your treebank is large.
 *
 * @return All trees as a series of big bracketed lists
 */
@Override
public String toString() {
  StringBuilder buffer = new StringBuilder();
  TreeVisitor appender = tree -> buffer.append(tree.toString()).append('\n');
  apply(appender);
  return buffer.toString();
}
private static final class CounterTreeProcessor implements TreeVisitor {
int i; // = 0;
public void visitTree(Tree t) {
i++;
}
public int total() {
return i;
}
}
/**
 * Returns the size of the Treebank, computed by visiting every tree with
 * {@link #apply(TreeVisitor)} and counting the visits, so each call
 * performs a full traversal.
 *
 * @return size How many trees are in the treebank
 */
@Override
public int size() {
  final CounterTreeProcessor tally = new CounterTreeProcessor();
  this.apply(tally);
  return tally.total();
}
/**
 * Divide a Treebank into 3, by taking every 9th sentence for the dev
 * set and every 10th for the test set. Penn people do this.
 * All other trees go to the training set. Trees are written in Penn
 * format in iteration order. The supplied writers are flushed but not
 * closed; closing them remains the caller's responsibility.
 *
 * @param trainW destination for the training trees
 * @param devW destination for the development trees
 * @param testW destination for the test trees
 * @throws IOException if writing to any of the three writers failed
 */
public void decimate(Writer trainW, Writer devW, Writer testW) throws IOException {
  PrintWriter trainPW = new PrintWriter(trainW, true);
  PrintWriter devPW = new PrintWriter(devW, true);
  PrintWriter testPW = new PrintWriter(testW, true);
  int i = 0; // position within the current cycle of ten trees
  for (Tree t : this) {
    if (i == 8) {
      t.pennPrint(devPW);
    } else if (i == 9) {
      t.pennPrint(testPW);
    } else {
      t.pennPrint(trainPW);
    }
    i = (i + 1) % 10;
  }
  // PrintWriter never throws on I/O failure; it only records it. Query
  // each writer (which also flushes it) so that a failed write is reported
  // instead of silently producing truncated output.
  boolean trainFailed = trainPW.checkError();
  boolean devFailed = devPW.checkError();
  boolean testFailed = testPW.checkError();
  if (trainFailed || devFailed || testFailed) {
    throw new IOException("Treebank.decimate: error writing to "
        + (trainFailed ? "train " : "")
        + (devFailed ? "dev " : "")
        + (testFailed ? "test " : "")
        + "output");
  }
}
/**
 * Return various statistics about the treebank (number of sentences,
 * words, tag set, etc.), without any language-specific information:
 * this delegates to {@link #textualSummary(TreebankLanguagePack)} with a
 * <code>null</code> language pack.
 *
 * @return A String with various statistics about the treebank (number of
 * sentences, words, tag set, etc.).
 */
public String textualSummary() {
  return textualSummary((TreebankLanguagePack) null);
}
/**
 * Return various statistics about the treebank (number of sentences,
 * words, tag set, etc.), together with warnings about trees that look
 * malformed (multiple root categories, non-unary root rewrites, bare
 * leaves, null or empty labels, phrasal nodes with leaf children,
 * categories or tags containing '@', and so on).
 *
 * <p>Side effect: while scanning, any tree node whose label is
 * <code>null</code> is given an empty <code>StringLabel</code>, and any
 * label whose value is <code>null</code> has its value set to the empty
 * string.
 *
 * @param tlp The TreebankLanguagePack used to determine which tags are
 * punctuation tags; may be <code>null</code>, in which case no
 * punctuation statistics are collected
 * @return A big string for human consumption describing the treebank
 * @throws IllegalStateException if some tree node is neither a leaf, a
 * preterminal, nor phrasal
 */
public String textualSummary(TreebankLanguagePack tlp) {
  int numTrees = 0;
  int numTreesLE40 = 0;
  int numNonUnaryRoots = 0;
  Tree nonUnaryEg = null;
  ClassicCounter<Tree> nonUnaries = new ClassicCounter<Tree>();
  ClassicCounter<String> roots = new ClassicCounter<String>();
  ClassicCounter<String> starts = new ClassicCounter<String>();
  ClassicCounter<String> puncts = new ClassicCounter<String>();
  int numUnenclosedLeaves = 0;
  int numLeaves = 0;
  int numNonPhrasal = 0;
  int numPreTerminalWithMultipleChildren = 0;
  int numWords = 0;
  int numTags = 0;
  int shortestSentence = Integer.MAX_VALUE;
  int longestSentence = 0;
  int numNullLabel = 0;
  Set<String> words = Generics.newHashSet();
  ClassicCounter<String> tags = new ClassicCounter<String>();
  ClassicCounter<String> cats = new ClassicCounter<String>();
  Tree leafEg = null;
  Tree preTerminalMultipleChildrenEg = null;
  Tree nullLabelEg = null;
  Tree rootRewritesAsTaggedWordEg = null;

  // Pass over the treebank: gather per-tree and per-node statistics.
  for (Tree t : this) {
    roots.incrementCount(t.value());
    numTrees++;
    int leng = t.yield().size();
    if (leng <= 40) {
      numTreesLE40++;
    }
    if (leng < shortestSentence) {
      shortestSentence = leng;
    }
    if (leng > longestSentence) {
      longestSentence = leng;
    }

    // Classify the shape of the root of this tree.
    if (t.numChildren() > 1) {
      if (numNonUnaryRoots == 0) {
        nonUnaryEg = t;
      }
      // Only the first 100 non-unary root rewrites are recorded in detail.
      if (numNonUnaryRoots < 100) {
        nonUnaries.incrementCount(t.localTree());
      }
      numNonUnaryRoots++;
    } else if (t.isLeaf()) {
      numUnenclosedLeaves++;
    } else {
      Tree t2 = t.firstChild();
      if (t2.isLeaf()) {
        numLeaves++;
        leafEg = t; // keeps the most recently seen example
      } else if (t2.isPreTerminal()) {
        if (numNonPhrasal == 0) {
          rootRewritesAsTaggedWordEg = t;
        }
        numNonPhrasal++;
      }
      starts.incrementCount(t2.value());
    }

    // Examine every node of the tree.
    for (Tree subtree : t) {
      Label lab = subtree.label();
      if (lab == null || lab.value() == null || "".equals(lab.value())) {
        if (numNullLabel == 0) {
          nullLabelEg = subtree;
        }
        numNullLabel++;
        // Repair null labels / null values so the code below sees "".
        if (lab == null) {
          subtree.setLabel(new StringLabel(""));
        } else if (lab.value() == null) {
          subtree.label().setValue("");
        }
      }
      if (subtree.isLeaf()) {
        numWords++;
        words.add(subtree.value());
      } else if (subtree.isPreTerminal()) {
        numTags++;
        tags.incrementCount(subtree.value());
        if (tlp != null && tlp.isPunctuationTag(subtree.value())) {
          puncts.incrementCount(subtree.firstChild().value());
        }
      } else if (subtree.isPhrasal()) {
        boolean hasLeafChild = false;
        for (Tree kt : subtree.children()) {
          if (kt.isLeaf()) {
            hasLeafChild = true;
            break; // one leaf child is enough to flag this node
          }
        }
        if (hasLeafChild) {
          numPreTerminalWithMultipleChildren++;
          if (preTerminalMultipleChildrenEg == null) {
            preTerminalMultipleChildrenEg = subtree;
          }
        }
        cats.incrementCount(subtree.value());
      } else {
        throw new IllegalStateException("Treebank: Bad tree in treebank!: " + subtree);
      }
    }
  }

  // Render the report.
  StringWriter sw = new StringWriter(2000);
  PrintWriter pw = new PrintWriter(sw);
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(0);
  pw.println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
  if (numTrees > 0) {
    if (numTags != numWords) {
      pw.println(" Warning! numTags differs and is " + numTags);
    }
    if (roots.size() == 1) {
      String root = roots.keySet().iterator().next();
      pw.println(" The root category is: " + root);
    } else {
      pw.println(" Warning! " + roots.size() + " different roots in treebank: " + Counters.toString(roots, nf));
    }
    if (numNonUnaryRoots > 0) {
      pw.print(" Warning! " + numNonUnaryRoots + " trees without unary initial rewrite. ");
      if (numNonUnaryRoots > 100) {
        pw.print("First 100 ");
      }
      pw.println("Rewrites: " + Counters.toString(nonUnaries, nf));
      pw.println(" Example: " + nonUnaryEg);
    }
    if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0) {
      pw.println(" Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
      if (numLeaves > 0) {
        pw.println(" Example bad root rewrites as leaf: " + leafEg);
      }
      if (numNonPhrasal > 0) {
        pw.println(" Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
      }
    }
    if (numNullLabel > 0) {
      pw.println(" Warning! " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
      pw.println(" " + nullLabelEg);
    }
    if (numPreTerminalWithMultipleChildren > 0) {
      pw.println(" Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
      pw.println(" Example: " + preTerminalMultipleChildrenEg);
    }
    // Average truncated to two decimals. The multiplication is done in long
    // arithmetic: numWords * 100 overflows an int beyond ~21 million tokens.
    double averageLength = ((numWords * 100L) / numTrees) / 100.0;
    pw.println(" Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + averageLength + " words.");
    pw.println(" " + cats.size() + " phrasal category types, " + tags.size() + " tag types, and " + words.size() + " word types");
    String[] empties = {"*", "0", "*T*", "*RNR*", "*U*",
        "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*",
        "*OP*", "*pro*", "*PRO*"};
    // What a dopey choice using 0 as an empty element name!!
    // The problem with the below is that words aren't turned into a basic
    // category, but empties commonly are indexed.... Would need to look
    // for them with a suffix of -[0-9]+
    Set<String> knownEmpties = Generics.newHashSet(Arrays.asList(empties));
    Set<String> emptiesIntersection = Sets.intersection(words, knownEmpties);
    if ( ! emptiesIntersection.isEmpty()) {
      pw.println(" Caution! " + emptiesIntersection.size() +
          " word types are known empty elements: " +
          emptiesIntersection);
    }
    Set<String> joint = Sets.intersection(cats.keySet(), tags.keySet());
    if ( ! joint.isEmpty()) {
      pw.println(" Warning! " + joint.size() + " items are tags and categories: " + joint);
    }
    // Report only the first offending category / tag containing '@'.
    for (String cat : cats.keySet()) {
      if (cat != null && cat.contains("@")) {
        pw.println(" Warning!! Stanford Parser does not work with categories containing '@' like: " + cat);
        break;
      }
    }
    for (String cat : tags.keySet()) {
      if (cat != null && cat.contains("@")) {
        pw.println(" Warning!! Stanford Parser does not work with tags containing '@' like: " + cat);
        break;
      }
    }
    pw.println(" Cats: " + Counters.toString(cats, nf));
    pw.println(" Tags: " + Counters.toString(tags, nf));
    pw.println(" " + starts.size() + " start categories: " + Counters.toString(starts, nf));
    if ( ! puncts.isEmpty()) {
      pw.println(" Puncts: " + Counters.toString(puncts, nf));
    }
  }
  return sw.toString();
}
/**
 * This operation isn't supported for a Treebank. Tell them immediately.
 *
 * @param o the element whose removal was requested (ignored)
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public boolean remove(Object o) {
throw new UnsupportedOperationException("Treebank is read-only");
}
}