Package edu.stanford.nlp.parser.tools

Source Code of edu.stanford.nlp.parser.tools.ManipulateTopBracket

package edu.stanford.nlp.parser.tools;

import java.io.PrintWriter;
import java.util.Collections;
import java.util.Map;
import java.util.Properties;

import edu.stanford.nlp.international.Languages;
import edu.stanford.nlp.international.Languages.Language;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;

/**
* Reads in a set of treebank files and either adds (default) or removes a top bracket.
*
* @author Spence Green
*
*/
public class ManipulateTopBracket {

  private static final int minArgs = 1;
  private static String usage() {
    StringBuilder usage = new StringBuilder();
    String nl = System.getProperty("line.separator");
    usage.append(String.format("Usage: java %s [OPTS] file(s) > bracketed_trees%n%n",ManipulateTopBracket.class.getName()));
    usage.append("Options:").append(nl);
    usage.append("  -v         : Verbose mode.").append(nl);
    usage.append("  -r         : Remove top bracket.").append(nl);
    usage.append("  -l lang    : Select language settings from " + Languages.listOfLanguages()).append(nl);
    usage.append("  -e enc     : Encoding.").append(nl);
    return usage.toString();
  }
  private static Map<String,Integer> argDefs() {
    Map<String,Integer> argDefs = Generics.newHashMap();
    argDefs.put("e", 1);
    argDefs.put("v", 0);
    argDefs.put("l", 1);
    argDefs.put("r", 0);
    return argDefs;
  }

  public static void main(String[] args) {
    if(args.length < minArgs) {
      System.out.println(usage());
      System.exit(-1);
    }

    Properties options = StringUtils.argsToProperties(args, argDefs());
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = Languages.getLanguageParams(language);
    DiskTreebank tb = null;
    String encoding = options.getProperty("l", "UTF-8");
    boolean removeBracket = PropertiesUtils.getBool(options, "b", false);
   
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    tb = tlpp.diskTreebank();

    String[] files = options.getProperty("", "").split("\\s+");
    if (files.length != 0) {
      for (String filename : files) {
        tb.loadPath(filename);
      }
    } else {
      System.err.println(usage());
      System.exit(-1);
    }

    PrintWriter pwo = tlpp.pw();
    String startSymbol = tlpp.treebankLanguagePack().startSymbol();
    TreeFactory tf = new LabeledScoredTreeFactory();
    int nTrees = 0;
    for(Tree t : tb) {
      if(removeBracket) {
        if(t.value().equals(startSymbol)) {
          t = t.firstChild();
        }
       
      } else if( ! t.value().equals(startSymbol)) { //Add a bracket if it isn't already there
        t = tf.newTreeNode(startSymbol, Collections.singletonList(t));
      }
      pwo.println(t.toString());
      nTrees++;
    }     
    pwo.close();
    System.err.printf("Processed %d trees.%n", nTrees);
  }
}
TOP

Related Classes of edu.stanford.nlp.parser.tools.ManipulateTopBracket

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.