package edu.stanford.nlp.parser.tools;
import java.io.PrintWriter;
import java.util.Collections;
import java.util.Map;
import java.util.Properties;
import edu.stanford.nlp.international.Languages;
import edu.stanford.nlp.international.Languages.Language;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
/**
* Reads in a set of treebank files and either adds (default) or removes a top bracket.
*
* @author Spence Green
*
*/
public class ManipulateTopBracket {
private static final int minArgs = 1;
private static String usage() {
StringBuilder usage = new StringBuilder();
String nl = System.getProperty("line.separator");
usage.append(String.format("Usage: java %s [OPTS] file(s) > bracketed_trees%n%n",ManipulateTopBracket.class.getName()));
usage.append("Options:").append(nl);
usage.append(" -v : Verbose mode.").append(nl);
usage.append(" -r : Remove top bracket.").append(nl);
usage.append(" -l lang : Select language settings from " + Languages.listOfLanguages()).append(nl);
usage.append(" -e enc : Encoding.").append(nl);
return usage.toString();
}
private static Map<String,Integer> argDefs() {
Map<String,Integer> argDefs = Generics.newHashMap();
argDefs.put("e", 1);
argDefs.put("v", 0);
argDefs.put("l", 1);
argDefs.put("r", 0);
return argDefs;
}
public static void main(String[] args) {
if(args.length < minArgs) {
System.out.println(usage());
System.exit(-1);
}
Properties options = StringUtils.argsToProperties(args, argDefs());
Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
TreebankLangParserParams tlpp = Languages.getLanguageParams(language);
DiskTreebank tb = null;
String encoding = options.getProperty("l", "UTF-8");
boolean removeBracket = PropertiesUtils.getBool(options, "b", false);
tlpp.setInputEncoding(encoding);
tlpp.setOutputEncoding(encoding);
tb = tlpp.diskTreebank();
String[] files = options.getProperty("", "").split("\\s+");
if (files.length != 0) {
for (String filename : files) {
tb.loadPath(filename);
}
} else {
System.err.println(usage());
System.exit(-1);
}
PrintWriter pwo = tlpp.pw();
String startSymbol = tlpp.treebankLanguagePack().startSymbol();
TreeFactory tf = new LabeledScoredTreeFactory();
int nTrees = 0;
for(Tree t : tb) {
if(removeBracket) {
if(t.value().equals(startSymbol)) {
t = t.firstChild();
}
} else if( ! t.value().equals(startSymbol)) { //Add a bracket if it isn't already there
t = tf.newTreeNode(startSymbol, Collections.singletonList(t));
}
pwo.println(t.toString());
nTrees++;
}
pwo.close();
System.err.printf("Processed %d trees.%n", nTrees);
}
}