Package edu.stanford.nlp.parser.tools

Source Code of edu.stanford.nlp.parser.tools.RHSFrequency

package edu.stanford.nlp.parser.tools;

import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import edu.stanford.nlp.international.Languages;
import edu.stanford.nlp.international.Languages.Language;
import edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;

/**
* Create frequency distribution for RHS of grammar rules.
*
* @author Spence Green
*
*/
public class RHSFrequency {

  private static final int minArgs = 2;
  private static final StringBuilder usage = new StringBuilder();
  static {
    usage.append(String.format("Usage: java %s [OPTS] lhs tree_file \n\n",RHSFrequency.class.getName()));
    usage.append("Options:\n");
    usage.append("  -l lang    : Select language settings from " + Languages.listOfLanguages() + "\n");
    usage.append("  -e enc     : Encoding.\n");
  }

  public static void main(String[] args) {
    if(args.length < minArgs) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    TregexPattern rootMatch = null;
   
    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            Language lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        rootMatch = TregexPattern.compile("@" + args[i++]);

        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i++]);
      }
    }

    Counter<String> rhsCounter = new ClassicCounter<String>();
    for(Tree t : tb) {
      TregexMatcher m = rootMatch.matcher(t);
      while(m.findNextMatchingNode()) {
        Tree match = m.getMatch();
        StringBuilder sb = new StringBuilder();
        for(Tree kid : match.children())
          sb.append(kid.value()).append(" ");
        rhsCounter.incrementCount(sb.toString().trim());
      }
    }

    List<String> biggestKeys = new ArrayList<String>(rhsCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));

    PrintWriter pw = tlpp.pw();
    for(String rhs : biggestKeys)
      pw.printf("%s\t%d%n", rhs,(int) rhsCounter.getCount(rhs));
    pw.close();
  }
 
}
TOP

Related Classes of edu.stanford.nlp.parser.tools.RHSFrequency

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.