package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.parser.metrics.AbstractEval;
import edu.stanford.nlp.parser.tools.PunctEquivalenceClasser;
import edu.stanford.nlp.process.SerializableFunction;
import edu.stanford.nlp.stats.EquivalenceClasser;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Index;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* An abstract class providing a common method base from which to
* complete a {@code TreebankLangParserParams} implementing class.
* <p>
* With some extending classes you'll want to have access to special
* attributes of the corresponding TreebankLanguagePack while taking
* advantage of this class's code for making the TreebankLanguagePack
* accessible. A good way to do this is to pass a new instance of the
* appropriate TreebankLanguagePack into this class's constructor,
* then get it back later on by casting a call to
* treebankLanguagePack(). See ChineseTreebankParserParams for an
* example.
*
* @author Roger Levy
*/
public abstract class AbstractTreebankParserParams implements TreebankLangParserParams {
/**
* Controls whether grammatical functions take part in evaluation.
* If true, evaluation is over grammatical functions as well as the labels.
* If false, grammatical functions are stripped for evaluation. This really
* only makes sense if you've trained with grammatical functions but want to
* evaluate without them. Read by {@link #subcategoryStripper()} to decide
* which transformer to return.
*/
protected boolean evalGF = true;
/**
 * Tree transformer that reduces tag and category labels to their basic
 * category, putting a tree into a state suitable for evaluation. The
 * reduction itself is delegated to the enclosing object's
 * TreebankLanguagePack, so provided that language pack is defined correctly
 * this should work for any language.
 */
protected class SubcategoryStripper implements TreeTransformer {

  protected TreeFactory tf = new LabeledScoredTreeFactory();

  /**
   * Builds a stripped copy of {@code tree}; the input is not modified.
   *
   * @param tree The tree to copy
   * @return A new tree with the same shape and scores whose internal labels
   *         carry basic categories (and basic tags, when the label has one)
   */
  @Override
  public Tree transformTree(Tree tree) {
    final Label original = tree.label();

    // Leaves keep their label untouched; only the score is carried over.
    if (tree.isLeaf()) {
      Tree leafCopy = tf.newLeaf(original);
      leafCopy.setScore(tree.score());
      return leafCopy;
    }

    String category = treebankLanguagePack().basicCategory(original.value());

    // Stripping never removes a node, so every child has exactly one image.
    List<Tree> strippedKids = new ArrayList<Tree>(tree.numChildren());
    for (Tree kid : tree.children()) {
      strippedKids.add(transformTree(kid));
    }

    CategoryWordTag stripped = new CategoryWordTag(original);
    stripped.setCategory(category);
    if (original instanceof HasTag) {
      String rawTag = ((HasTag) original).tag();
      stripped.setTag(treebankLanguagePack().basicCategory(rawTag));
    }

    Tree copy = tf.newTreeNode(stripped, strippedKids);
    copy.setScore(tree.score());
    return copy;
  }
} // end class SubcategoryStripper
/**
 * Tree transformer that reduces tag and category labels to their basic
 * category and additionally strips grammatical functions, putting a tree
 * into a state suitable for evaluation. Very similar to
 * {@code SubcategoryStripper}; the only difference is the extra
 * {@code stripGF} step applied to categories and tags. Provided the
 * TreebankLanguagePack is defined correctly this should work for any
 * language.
 */
protected class RemoveGFSubcategoryStripper implements TreeTransformer {

  protected TreeFactory tf = new LabeledScoredTreeFactory();

  /**
   * Builds a stripped copy of {@code tree}; the input is not modified.
   *
   * @param tree The tree to copy
   * @return A new tree with the same shape and scores whose internal labels
   *         have been reduced with {@code basicCategory} and then
   *         {@code stripGF}
   */
  @Override
  public Tree transformTree(Tree tree) {
    final Label original = tree.label();

    // Leaves keep their label untouched; only the score is carried over.
    if (tree.isLeaf()) {
      Tree leafCopy = tf.newLeaf(original);
      leafCopy.setScore(tree.score());
      return leafCopy;
    }

    String category = treebankLanguagePack().basicCategory(original.value());
    category = treebankLanguagePack().stripGF(category);

    List<Tree> strippedKids = new ArrayList<Tree>(tree.numChildren());
    for (Tree kid : tree.children()) {
      strippedKids.add(transformTree(kid));
    }

    CategoryWordTag stripped = new CategoryWordTag(original);
    stripped.setCategory(category);
    if (original instanceof HasTag) {
      String reducedTag = treebankLanguagePack().basicCategory(((HasTag) original).tag());
      reducedTag = treebankLanguagePack().stripGF(reducedTag);
      stripped.setTag(reducedTag);
    }

    Tree copy = tf.newTreeNode(stripped, strippedKids);
    copy.setScore(tree.score());
    return copy;
  }
} // end class RemoveGFSubcategoryStripper
/** Name of the input character encoding; initialised from the language pack in the constructor and replaceable via setInputEncoding. */
protected String inputEncoding;
/** Name of the output character encoding; initialised from the language pack in the constructor and used by pw(OutputStream) when building a PrintWriter. */
protected String outputEncoding;
/** The language pack handed to the constructor; returned by treebankLanguagePack(). */
protected TreebankLanguagePack tlp;
/**
 * Remembers the given TreebankLanguagePack and seeds both the input and the
 * output encoding from the encoding that language pack reports.
 *
 * @param tlp The treebank language pack to use
 */
protected AbstractTreebankParserParams(TreebankLanguagePack tlp) {
  this.tlp = tlp;
  this.inputEncoding = tlp.getEncoding();
  this.outputEncoding = tlp.getEncoding();
}
/**
* Hook for post-processing a head word label. This base implementation
* performs no processing and returns its argument unchanged.
*
* @param headWord The head word label
* @return The same {@code headWord} object that was passed in
*/
@Override
public Label processHeadWord(Label headWord) {
return headWord;
}
/**
 * Chooses whether grammatical functions are considered in evaluation.
 *
 * @param evalGFs true to evaluate over grammatical functions as well as
 *                labels, false to have them stripped before evaluation
 */
@Override
public void setEvaluateGrammaticalFunctions(boolean evalGFs) {
  evalGF = evalGFs;
}
/**
 * Replaces the input encoding.
 *
 * @param encoding Name of the character encoding to store as the input encoding
 */
@Override
public void setInputEncoding(String encoding) {
  this.inputEncoding = encoding;
}
/**
 * Replaces the output encoding.
 *
 * @param encoding Name of the character encoding to store as the output encoding
 */
@Override
public void setOutputEncoding(String encoding) {
  this.outputEncoding = encoding;
}
/**
 * Reports the output encoding currently in use.
 *
 * @return The stored output encoding name
 */
@Override
public String getOutputEncoding() {
  return this.outputEncoding;
}
/**
 * Reports the input encoding currently in use.
 *
 * @return The stored input encoding name
 */
@Override
public String getInputEncoding() {
  return this.inputEncoding;
}
/**
* Returns a language specific object for evaluating PP attachment.
* This base implementation provides no such evaluator and returns
* {@code null}; callers must be prepared for that.
*
* @return An object that implements {@link AbstractEval}, or {@code null}
*         when none is available (always {@code null} in this class)
*/
@Override
public AbstractEval ppAttachmentEval() {
return null;
}
/**
* Returns a MemoryTreebank appropriate to the treebank source.
*
* @return A MemoryTreebank supplied by the implementing class
*/
@Override
public abstract MemoryTreebank memoryTreebank();
/**
* Returns a DiskTreebank appropriate to the treebank source.
*
* @return A DiskTreebank supplied by the implementing class
*/
@Override
public abstract DiskTreebank diskTreebank();
/**
* Returns the MemoryTreebank to use for test data. You can often return
* the same thing for testMemoryTreebank as for memoryTreebank, and this
* base implementation does exactly that by delegating to
* {@link #memoryTreebank()}.
*
* @return The result of {@link #memoryTreebank()}
*/
@Override
public MemoryTreebank testMemoryTreebank() {
return memoryTreebank();
}
/**
* Implemented as required by TreebankFactory. Use diskTreebank() instead;
* this method simply forwards to it.
*
* @return The result of {@link #diskTreebank()}
*/
@Override
public Treebank treebank() {
return diskTreebank();
}
/**
* The PrintWriter used to print output. It's the responsibility of
* pw to deal properly with character encodings for the relevant
* treebank. This overload writes to {@code System.out} by delegating to
* {@link #pw(OutputStream)}.
*
* @return A PrintWriter wrapping {@code System.out}
*/
@Override
public PrintWriter pw() {
return pw(System.out);
}
/**
 * The PrintWriter used to print output. It's the responsibility of
 * pw to deal properly with character encodings for the relevant
 * treebank.
 * <p>
 * The writer encodes with the current output encoding. If that encoding is
 * null, syntactically illegal, or not supported by this JVM, a warning is
 * printed to {@code System.err} and UTF-8 is used instead, so this method
 * never fails because of a bad encoding name and never falls back to the
 * platform default charset.
 *
 * @param o The stream to wrap
 * @return An auto-flushing PrintWriter over {@code o}
 */
@Override
public PrintWriter pw(OutputStream o) {
  java.nio.charset.Charset charset;
  try {
    charset = java.nio.charset.Charset.forName(outputEncoding);
  } catch (IllegalArgumentException e) {
    // Charset.forName signals a null name with IllegalArgumentException and
    // malformed / unknown names with IllegalCharsetNameException /
    // UnsupportedCharsetException, both of which are subclasses of it.
    System.err.println("Warning: desired encoding " + outputEncoding + " not accepted. ");
    System.err.println("Using UTF-8 to construct PrintWriter");
    charset = java.nio.charset.StandardCharsets.UTF_8;
  }
  // The Charset-based constructor declares no checked exception, so no
  // further fallback is needed once a Charset object is in hand.
  return new PrintWriter(new OutputStreamWriter(o, charset), true);
}
/**
* Returns an appropriate treebankLanguagePack: the instance that was
* passed to this class's constructor. Subclasses that need the concrete
* type can cast the result.
*
* @return The stored TreebankLanguagePack
*/
@Override
public TreebankLanguagePack treebankLanguagePack() {
return tlp;
}
/**
* The HeadFinder to use for your treebank.
*
* @return A HeadFinder supplied by the implementing class
*/
@Override
public abstract HeadFinder headFinder();
/**
* The HeadFinder to use when extracting typed dependencies.
*
* @return A HeadFinder supplied by the implementing class
*/
@Override
public abstract HeadFinder typedDependencyHeadFinder();
/**
* Creates the lexicon. This base implementation returns a new
* {@code BaseLexicon} constructed from the given arguments.
*
* @param op The parser options, passed through to the lexicon
* @param wordIndex The word index, passed through to the lexicon
* @param tagIndex The tag index, passed through to the lexicon
* @return A newly constructed BaseLexicon
*/
@Override
public Lexicon lex(Options op, Index<String> wordIndex, Index<String> tagIndex) {
return new BaseLexicon(op, wordIndex, tagIndex);
}
/**
 * Supplies the smoothing parameters for the MLEDependencyGrammar.
 * The defaults are the values previously hard coded into
 * MLEDependencyGrammar.
 *
 * @return a freshly allocated array of doubles holding, in order,
 *         smooth_aT_hTWd, smooth_aTW_hTWd, smooth_stop, and interp
 */
@Override
public double[] MLEDependencyGrammarSmoothingParams() {
  final double smoothATHTWd = 16.0;
  final double smoothATWHTWd = 16.0;
  final double smoothStop = 4.0;
  final double interp = 0.6;
  return new double[] { smoothATHTWd, smoothATWHTWd, smoothStop, interp };
}
/**
* Takes a Tree and a collinizer and returns a Collection of labeled
* {@link Constituent}s for PARSEVAL. Equivalent to calling the
* three-argument overload with {@code labelConstituents} set to true.
*
* @param t The tree to extract constituents from
* @param collinizer The TreeTransformer used to normalize the tree for
* evaluation
* @return The bag of Constituents for PARSEVAL.
*/
public static Collection<Constituent> parsevalObjectify(Tree t, TreeTransformer collinizer) {
return parsevalObjectify(t,collinizer,true);
}
/**
 * Takes a Tree and a collinizer and returns a Collection of
 * {@link Constituent}s for PARSEVAL evaluation. Some notes on this
 * particular parseval:
 * <ul>
 * <li> It is character-based, which allows it to be used on
 *      segmentation/parsing combination evaluation.
 * <li> Whether it gives you labeled or unlabeled bracketings depends on the
 *      value of the {@code labelConstituents} parameter.
 * </ul>
 *
 * Leaves and preterminals never contribute a constituent, and neither does
 * any non-root node for which no parent can be found under the transformed
 * root.
 *
 * (Note that I haven't checked this rigorously yet with the PARSEVAL
 * definition -- Roger.)
 *
 * @param t The tree to extract constituents from
 * @param collinizer The TreeTransformer used to normalize the tree; if it
 *                   returns null the result is empty
 * @param labelConstituents true for LabeledConstituents carrying the node
 *                          label, false for SimpleConstituents
 * @return The bag of Constituents for PARSEVAL
 */
public static Collection<Constituent> parsevalObjectify(Tree t, TreeTransformer collinizer, boolean labelConstituents) {
  Collection<Constituent> constituents = new ArrayList<Constituent>();
  Tree normalized = collinizer.transformTree(t);
  if (normalized == null) {
    return constituents;
  }
  for (Tree subtree : normalized) {
    boolean countsAsConstituent = !subtree.isLeaf()
        && !subtree.isPreTerminal()
        && (subtree == normalized || subtree.parent(normalized) != null);
    if (countsAsConstituent) {
      int start = normalized.leftCharEdge(subtree);
      int end = normalized.rightCharEdge(subtree);
      Constituent span = labelConstituents
          ? new LabeledConstituent(start, end, subtree.label())
          : new SimpleConstituent(start, end);
      constituents.add(span);
    }
  }
  return constituents;
}
/**
 * Collects the untyped word-word dependencies of a tree. Each dependency
 * is a list of head word, dependent word and a direction marker.
 *
 * @param t The tree to extract dependencies from
 * @param hf The HeadFinder used to pick heads
 * @param collinizer The TreeTransformer applied to the tree first
 * @return The collection of untyped dependencies
 */
public static Collection<List<String>> untypedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
  DependencyTyper<List<String>> typer = new UntypedDependencyTyper(hf);
  return dependencyObjectify(t, hf, collinizer, typer);
}
/**
 * Collects the unordered (but directed!) untyped word-word dependencies of
 * a tree. Each dependency is a list of head word and dependent word, with
 * no marker for their linear order.
 *
 * @param t The tree to extract dependencies from
 * @param hf The HeadFinder used to pick heads
 * @param collinizer The TreeTransformer applied to the tree first
 * @return The collection of unordered untyped dependencies
 */
public static Collection<List<String>> unorderedUntypedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
  DependencyTyper<List<String>> typer = new UnorderedUntypedDependencyTyper(hf);
  return dependencyObjectify(t, hf, collinizer, typer);
}
/**
 * Collects the word-word dependencies of a tree, typed by the syntactic
 * categories of the mother, head and daughter nodes.
 *
 * @param t The tree to extract dependencies from
 * @param hf The HeadFinder used to pick heads
 * @param collinizer The TreeTransformer applied to the tree first
 * @return The collection of typed dependencies
 */
public static Collection<List<String>> typedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
  DependencyTyper<List<String>> typer = new TypedDependencyTyper(hf);
  return dependencyObjectify(t, hf, collinizer, typer);
}
/**
 * Collects the unordered (but directed!) typed word-word dependencies of a
 * tree: typed by mother, head and daughter categories, with no marker for
 * the linear order of head and dependent.
 *
 * @param t The tree to extract dependencies from
 * @param hf The HeadFinder used to pick heads
 * @param collinizer The TreeTransformer applied to the tree first
 * @return The collection of unordered typed dependencies
 */
public static Collection<List<String>> unorderedTypedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
  DependencyTyper<List<String>> typer = new UnorderedTypedDependencyTyper(hf);
  return dependencyObjectify(t, hf, collinizer, typer);
}
/**
 * Extracts the dependencies in a tree, as produced by some
 * {@link edu.stanford.nlp.trees.DependencyTyper}.
 *
 * @param t The tree to extract dependencies from
 * @param hf The HeadFinder used to pick the head daughter of each node
 * @param collinizer The TreeTransformer applied to the tree first; if it
 *                   returns null the result is empty
 * @param typer Builds the dependency object for each head/dependent pair
 * @param <E> The type of dependency object produced by {@code typer}
 * @return The collected dependencies
 */
public static <E> Collection<E> dependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer, DependencyTyper<E> typer) {
  Collection<E> dependencies = new ArrayList<E>();
  Tree normalized = collinizer.transformTree(t);
  if (normalized != null) {
    dependencyObjectifyHelper(normalized, normalized, hf, dependencies, typer);
  }
  return dependencies;
}
/**
 * Recursive worker for {@code dependencyObjectify}. For every node that is
 * neither a leaf nor a preterminal it first descends into each child and
 * then, unless that child is the head daughter, adds a dependency from the
 * head daughter to the child.
 *
 * @param t The subtree currently being visited
 * @param root The root of the whole tree, handed to the typer
 * @param hf The HeadFinder used to pick the head daughter
 * @param c The collection that receives the dependencies
 * @param typer Builds the dependency object for each head/dependent pair
 */
private static <E> void dependencyObjectifyHelper(Tree t, Tree root, HeadFinder hf, Collection<E> c, DependencyTyper<E> typer) {
  boolean lexicalLevel = t.isLeaf() || t.isPreTerminal();
  if (!lexicalLevel) {
    Tree headDaughter = hf.determineHead(t);
    Tree[] daughters = t.children();
    for (int i = 0; i < daughters.length; i++) {
      Tree daughter = daughters[i];
      dependencyObjectifyHelper(daughter, root, hf, c, typer);
      if (daughter != headDaughter) {
        c.add(typer.makeDependency(headDaughter, daughter, root));
      }
    }
  }
}
/**
 * Produces untyped dependencies: a three-element list of head word,
 * dependent word and a direction marker.
 */
private static class UntypedDependencyTyper implements DependencyTyper<List<String>> {
  HeadFinder hf;

  public UntypedDependencyTyper(HeadFinder hf) {
    this.hf = hf;
  }

  @Override
  public List<String> makeDependency(Tree head, Tree dep, Tree root) {
    Tree headWord = head.headTerminal(hf);
    Tree dependentWord = dep.headTerminal(hf);
    // The dependency is "left headed" when the head word's left character
    // edge is strictly smaller than the dependent word's.
    boolean headComesFirst = root.leftCharEdge(headWord) < root.leftCharEdge(dependentWord);

    List<String> dependency = new ArrayList<String>(3);
    dependency.add(headWord.value());
    dependency.add(dependentWord.value());
    dependency.add(headComesFirst ? leftHeaded : rightHeaded);
    return dependency;
  }
}
/**
 * Produces unordered untyped dependencies: a list holding just the head
 * word followed by the dependent word.
 */
private static class UnorderedUntypedDependencyTyper implements DependencyTyper<List<String>> {
  HeadFinder hf;

  public UnorderedUntypedDependencyTyper(HeadFinder hf) {
    this.hf = hf;
  }

  @Override
  public List<String> makeDependency(Tree head, Tree dep, Tree root) {
    Tree headWord = head.headTerminal(hf);
    Tree dependentWord = dep.headTerminal(hf);

    List<String> dependency = new ArrayList<String>(3);
    dependency.add(headWord.value());
    dependency.add(dependentWord.value());
    return dependency;
  }
}
/** Direction marker appended by the ordered typers when the head word's left character edge precedes the dependent's. */
private static final String leftHeaded = "leftHeaded";
/** Direction marker appended by the ordered typers in every other case. */
private static final String rightHeaded = "rightHeaded";
/**
 * Produces typed dependencies: a six-element list of head word, dependent
 * word, mother category (the value of the head's parent), head category,
 * dependent category and a direction marker.
 */
private static class TypedDependencyTyper implements DependencyTyper<List<String>> {
  HeadFinder hf;

  public TypedDependencyTyper(HeadFinder hf) {
    this.hf = hf;
  }

  @Override
  public List<String> makeDependency(Tree head, Tree dep, Tree root) {
    Tree headWord = head.headTerminal(hf);
    Tree dependentWord = dep.headTerminal(hf);
    // The dependency is "left headed" when the head word's left character
    // edge is strictly smaller than the dependent word's.
    boolean headComesFirst = root.leftCharEdge(headWord) < root.leftCharEdge(dependentWord);

    List<String> dependency = new ArrayList<String>(6);
    dependency.add(headWord.value());
    dependency.add(dependentWord.value());
    dependency.add(head.parent(root).value());
    dependency.add(head.value());
    dependency.add(dep.value());
    dependency.add(headComesFirst ? leftHeaded : rightHeaded);
    return dependency;
  }
}
/**
 * Produces unordered typed dependencies: a list of head word, dependent
 * word, mother category (the value of the head's parent), head category and
 * dependent category, with no direction marker.
 */
private static class UnorderedTypedDependencyTyper implements DependencyTyper<List<String>> {
  HeadFinder hf;

  public UnorderedTypedDependencyTyper(HeadFinder hf) {
    this.hf = hf;
  }

  @Override
  public List<String> makeDependency(Tree head, Tree dep, Tree root) {
    Tree headWord = head.headTerminal(hf);
    Tree dependentWord = dep.headTerminal(hf);

    List<String> dependency = new ArrayList<String>(6);
    dependency.add(headWord.value());
    dependency.add(dependentWord.value());
    dependency.add(head.parent(root).value());
    dependency.add(head.value());
    dependency.add(dep.value());
    return dependency;
  }
}
/**
 * Returns an EquivalenceClasser that classes typed dependencies by the
 * syntactic categories of mother, head and daughter, plus direction.
 * It expects lists laid out as the typed dependency typer in this class
 * builds them: index 2 is the mother, 3 the head, 4 the daughter and 5 the
 * direction marker. A left-headed dependency is rendered as
 * {@code mother(head->daughter)}, any other as
 * {@code mother(daughter<-head)}.
 *
 * @return An Equivalence class for typed dependencies
 */
public static EquivalenceClasser<List<String>, String> typedDependencyClasser() {
  return s -> {
    boolean headOnLeft = s.get(5).equals(leftHeaded);
    String mother = s.get(2);
    String head = s.get(3);
    String daughter = s.get(4);
    return headOnLeft
        ? mother + '(' + head + "->" + daughter + ')'
        : mother + '(' + daughter + "<-" + head + ')';
  };
}
/**
* The tree transformer used to produce trees for evaluation. Will
* be applied both to the parse output tree and to the gold
* tree. Should strip punctuation and maybe do some other things.
*
* @return The evaluation TreeTransformer supplied by the implementing class
*/
@Override
public abstract TreeTransformer collinizer();
/**
* The tree transformer used to produce trees for evaluation. Will
* be applied both to the parse output tree and to the gold
* tree. Should strip punctuation and maybe do some other
* things. The evalb version should strip some more stuff
* off.
* TODO: this documentation is incomplete; what exactly the evalb
* variant strips beyond {@link #collinizer()} is not specified here.
*
* @return The evalb TreeTransformer supplied by the implementing class
*/
@Override
public abstract TreeTransformer collinizerEvalb();
/**
* Returns the splitting strings used for selective splits.
*
* @return An array containing ancestor-annotated Strings: categories
* should be split according to these ancestor annotations.
*/
@Override
public abstract String[] sisterSplitters();
/**
 * Returns a TreeTransformer appropriate to the Treebank which can be used
 * to remove functional tags (such as "-TMP") from categories. When
 * {@code evalGF} is false the returned transformer also removes grammatical
 * functions; if GFs were not used in training, results are equivalent.
 *
 * @return A new SubcategoryStripper when {@code evalGF} is true, otherwise
 *         a new RemoveGFSubcategoryStripper
 */
@Override
public TreeTransformer subcategoryStripper() {
  return evalGF ? new SubcategoryStripper() : new RemoveGFSubcategoryStripper();
}
/**
* This method does language-specific tree transformations such
* as annotating particular nodes with language-relevant features.
* Such parameterizations should be inside the specific
* TreebankLangParserParams class. This method is recursively
* applied to each node in the tree (depth first, left-to-right),
* so you shouldn't write this method to apply recursively to tree
* members. This method is allowed to (and in some cases does)
* destructively change the input tree {@code t}. It changes both
* labels and the tree shape.
*
* @param t The input tree (with non-language specific annotation already
* done, so you need to strip back to basic categories)
* @param root The root of the current tree (can be null for words)
* @return The fully annotated tree node (with daughters still as you
* want them in the final result)
*/
@Override
public abstract Tree transformTree(Tree t, Tree root);
/**
* Displays language-specific settings. Where and how they are displayed
* is up to the implementing class.
*/
@Override
public abstract void display();
/**
* Set language-specific options according to flags.
* This routine should process the option starting in args[i] (which
* might potentially be several arguments long if it takes arguments).
* It should return the index after the last index it consumed in
* processing. In particular, if it cannot process the current option,
* the return value should be i.
* <p>
* Generic options are processed separately by
* {@link Options#setOption(String[],int)},
* and implementations of this method do not have to worry about them.
* The Options class handles routing options.
* TreebankParserParams that extend this class should call super when
* overriding this method.
* <p>
* This base implementation recognizes no options: it consumes nothing
* and always returns {@code i}.
*
* @param args The full argument array
* @param i The index of the option to process
* @return The index after the last argument consumed, or {@code i} if the
* option was not recognized
*/
@Override
public int setOptionFlag(String[] args, int i) {
return i;
}
/** Serialization version identifier for this class; keep the value stable so existing serialized instances stay compatible. */
private static final long serialVersionUID = 4299501909017975915L;
/**
* Returns a tokenizer factory that produces trees. This base implementation
* wraps the result of {@code treeReaderFactory()} (declared outside this
* class) in a new {@code TreeTokenizerFactory}.
*
* @return A newly constructed TreeTokenizerFactory
*/
@Override
public TokenizerFactory<Tree> treeTokenizerFactory() {
return new TreeTokenizerFactory(treeReaderFactory());
}
/**
* Returns the extractor used to build a DependencyGrammar. This base
* implementation returns a new {@code MLEDependencyGrammarExtractor}
* constructed from the given arguments.
*
* @param op The parser options, passed through to the extractor
* @param wordIndex The word index, passed through to the extractor
* @param tagIndex The tag index, passed through to the extractor
* @return A newly constructed MLEDependencyGrammarExtractor
*/
@Override
public Extractor<DependencyGrammar> dependencyGrammarExtractor(Options op, Index<String> wordIndex, Index<String> tagIndex) {
return new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
}
/**
 * Reports whether grammatical functions are considered in evaluation.
 *
 * @return The current value of {@code evalGF}
 */
public boolean isEvalGF() {
  return this.evalGF;
}
/**
 * Chooses whether grammatical functions are considered in evaluation.
 * Stores into the same field as
 * {@code setEvaluateGrammaticalFunctions(boolean)}.
 *
 * @param evalGF The new value of the flag
 */
public void setEvalGF(boolean evalGF) {
  this.evalGF = evalGF;
}
/**
 * Annotation function for mapping punctuation to PTB-style equivalence
 * classes. The punctuation is read from the matcher node registered under
 * {@code key}; when it belongs to a non-empty punctuation class the result
 * is {@code annotationMark} followed by that class, otherwise the empty
 * string.
 *
 * @author Spence Green
 */
protected static class AnnotatePunctuationFunction implements SerializableFunction<TregexMatcher,String> {

  private static final long serialVersionUID = 1L;

  private final String key;
  private final String annotationMark;

  /**
   * @param annotationMark The prefix placed in front of the punctuation class
   * @param key The name of the matcher node that holds the punctuation
   */
  public AnnotatePunctuationFunction(String annotationMark, String key) {
    this.annotationMark = annotationMark;
    this.key = key;
  }

  @Override
  public String apply(TregexMatcher m) {
    String punctuation = m.getNode(key).value();
    String equivalenceClass = PunctEquivalenceClasser.getPunctClass(punctuation);
    if (equivalenceClass.isEmpty()) {
      return "";
    }
    return annotationMark + equivalenceClass;
  }

  @Override
  public String toString() {
    return "AnnotatePunctuationFunction";
  }
}
/**
 * Reads grammatical structures from a file. Not supported by this base
 * class: the call always fails.
 *
 * @param filename The file to read from (ignored here)
 * @return Never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public List<GrammaticalStructure> readGrammaticalStructureFromFile(String filename) {
  final String message = "This language does not support GrammaticalStructures or dependencies";
  throw new UnsupportedOperationException(message);
}
/**
 * Builds a grammatical structure for a tree. Not supported by this base
 * class: the call always fails.
 *
 * @param t The tree (ignored here)
 * @param filter The filter (ignored here)
 * @param hf The HeadFinder (ignored here)
 * @return Never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public GrammaticalStructure getGrammaticalStructure(Tree t, Predicate<String> filter, HeadFinder hf) {
  final String message = "This language does not support GrammaticalStructures or dependencies";
  throw new UnsupportedOperationException(message);
}
/**
* By default, parsers are assumed to not support dependencies.
* Only English and Chinese do at present.
*
* @return {@code false} in this base implementation
*/
@Override
public boolean supportsBasicDependencies() {
return false;
}
/** Shared zero-length array returned by defaultCoreNLPFlags(); it has no elements, so sharing one instance is safe. */
private static final String[] EMPTY_ARGS = new String[0];
/**
* Returns the default CoreNLP flags. This base implementation has none and
* returns a shared empty array.
*
* @return A zero-length array
*/
@Override
public String[] defaultCoreNLPFlags() {
return EMPTY_ARGS;
}
}