Package edu.stanford.nlp.parser.lexparser

Examples of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams$EnglishSubcategoryStripper


    if(args.length < minArgs) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    int maxGuessYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    boolean skipGuess = false;
    boolean tagMode = false;
    String guessFile = null;
    String goldFile = null;

    for(int i = 0; i < args.length; i++) {

      if(args[i].startsWith("-")) {

        switch (args[i]) {
          case "-l":
            Language lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-y":
            maxGoldYield = Integer.parseInt(args[++i].trim());

            break;
          case "-t":
            tagMode = true;

            break;
          case "-v":
            VERBOSE = true;

            break;
          case "-g":
            maxGuessYield = Integer.parseInt(args[++i].trim());
            skipGuess = true;

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        //Required parameters
        goldFile = args[i++];
        guessFile = args[i];
        break;
      }
    }

    final PrintWriter pwOut = tlpp.pw();

    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());

    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());

    final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
    final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);

    final TreeTransformer tc = tlpp.collinizer();

    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
View Full Code Here


    if(args.length < minArgs) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    Language lang = Language.English;

    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i]);
      }
    }

    PrintWriter pw = tlpp.pw();
    Options op = new Options();
    Options.LexOptions lexOptions = op.lexOptions;
    if(lang == Language.French) {
      lexOptions.useUnknownWordSignatures = 1;
      lexOptions.smartMutation = false;
      lexOptions.unknownSuffixSize = 2;
      lexOptions.unknownPrefixSize = 1;
    } else if(lang == Language.Arabic) {
      lexOptions.smartMutation = false;
      lexOptions.useUnknownWordSignatures = 9;
      lexOptions.unknownPrefixSize = 1;
      lexOptions.unknownSuffixSize = 1;
    }
    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);
   
    int computeAfter = (int) (0.50 * tb.size());
    Counter<String> vocab = new ClassicCounter<String>();
    Counter<String> unkCounter = new ClassicCounter<String>();
    int treeId = 0;
View Full Code Here

    case Spanish:
      tlpp = new SpanishTreebankParserParams();
      break;

    default:
      tlpp = new EnglishTreebankParserParams();
    }
    return tlpp;
  }
View Full Code Here

   * Run the Evalb scoring metric on guess/gold input. The default language is English.
   *
   * @param args
   */
  public static void main(String[] args) {
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    String encoding = "UTF-8";

    String guessFile = null;
    String goldFile = null;

    Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);

    for(Map.Entry<String, String[]> opt : argsMap.entrySet()) {
      if(opt.getKey() == null) continue;
      if(opt.getKey().equals("-l")) {
        Language lang = Language.valueOf(opt.getValue()[0].trim());
        tlpp = Languages.getLanguageParams(lang);

      } else if(opt.getKey().equals("-y")) {
        maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());

      } else if(opt.getKey().equals("-v")) {
        VERBOSE = true;

      } else if(opt.getKey().equals("-e")) {
        encoding = opt.getValue()[0];

      } else {
        System.err.println(usage.toString());
        System.exit(-1);
      }

      //Non-option arguments located at key null
      String[] rest = argsMap.get(null);
      if(rest == null || rest.length < minArgs) {
        System.err.println(usage.toString());
        System.exit(-1);
      }
      goldFile = rest[0];
      guessFile = rest[1];
    }

    tlpp.setInputEncoding(encoding);
    final PrintWriter pwOut = tlpp.pw();

    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());

    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());

    final UnlabeledAttachmentEval metric = new UnlabeledAttachmentEval("UAS LP/LR", true, tlpp.headFinder());

    final TreeTransformer tc = tlpp.collinizer();

    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
View Full Code Here

    // the intermediate GrammaticalStructure instead
    SemanticGraph graph = SemanticGraphFactory.generateUncollapsedDependencies(tree);

    // Alternatively, this could have been the Chinese params or any
    // other language supported.  As of 2014, only English and Chinese
    TreebankLangParserParams params = new EnglishTreebankParserParams();
    GrammaticalStructureFactory gsf = params.treebankLanguagePack().grammaticalStructureFactory(params.treebankLanguagePack().punctuationWordRejectFilter(), params.typedDependencyHeadFinder());

    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);

    System.err.println(graph);
View Full Code Here

    if(args.length < minArgs) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";

    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            Language lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i]);
      }
    }

    Counter<String> vocab = new ClassicCounter<String>();
    for(Tree t : tb) {
      List<Label> yield = t.yield();
      for(Label word : yield)
        vocab.incrementCount(word.value());
    }

    List<String> biggestKeys = new ArrayList<String>(vocab.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(vocab));

    PrintWriter pw = tlpp.pw();
    for(String wordType : biggestKeys)
      pw.printf("%s\t%d%n", wordType,(int) vocab.getCount(wordType));
    pw.close();
  }
View Full Code Here

    if(args.length < minArgs) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    String puncTag = null;

    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            Language lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        puncTag = args[i++];
       
        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i]);
      }
    }

    Counter<String> puncTypes = new ClassicCounter<String>();
    for(Tree t : tb) {
      List<CoreLabel> yield = t.taggedLabeledYield();
      for(CoreLabel word : yield)
        if(word.tag().equals(puncTag))
          puncTypes.incrementCount(word.word());
    }

    List<String> biggestKeys = new ArrayList<String>(puncTypes.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(puncTypes));

    PrintWriter pw = tlpp.pw();
    for(String wordType : biggestKeys)
      pw.printf("%s\t%d%n", wordType,(int) puncTypes.getCount(wordType));
    pw.close();
  }
View Full Code Here

    if(args.length < minArgs) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    String encoding = "UTF-8";

    String guessFile = null;
    String goldFile = null;

    Map<String, String[]> argsMap = StringUtils.argsToMap(args, optionArgDefs);

    for(Map.Entry<String, String[]> opt : argsMap.entrySet()) {
      if(opt.getKey() == null) continue;
      if(opt.getKey().equals("-l")) {
        Language lang = Language.valueOf(opt.getValue()[0].trim());
        tlpp = Languages.getLanguageParams(lang);

      } else if(opt.getKey().equals("-y")) {
        maxGoldYield = Integer.parseInt(opt.getValue()[0].trim());

      } else if(opt.getKey().equals("-v")) {
        VERBOSE = true;

      } else if(opt.getKey().equals("-c")) {
        TaggingEval.doCatLevelEval = true;

      } else if(opt.getKey().equals("-e")) {
        encoding = opt.getValue()[0];

      } else {
        System.err.println(usage.toString());
        System.exit(-1);
      }

      //Non-option arguments located at key null
      String[] rest = argsMap.get(null);
      if(rest == null || rest.length < minArgs) {
        System.err.println(usage.toString());
        System.exit(-1);
      }
      goldFile = rest[0];
      guessFile = rest[1];
    }

    tlpp.setInputEncoding(encoding);
    final PrintWriter pwOut = tlpp.pw();

    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());

    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());

    final TaggingEval metric = new TaggingEval("Tagging LP/LR");

    final TreeTransformer tc = tlpp.collinizer();

    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
View Full Code Here

    if(args.length < minArgs) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    TregexPattern rootMatch = null;
   
    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            Language lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        rootMatch = TregexPattern.compile("@" + args[i++]);

        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i++]);
      }
    }

    Counter<String> rhsCounter = new ClassicCounter<String>();
    for(Tree t : tb) {
      TregexMatcher m = rootMatch.matcher(t);
      while(m.findNextMatchingNode()) {
        Tree match = m.getMatch();
        StringBuilder sb = new StringBuilder();
        for(Tree kid : match.children())
          sb.append(kid.value()).append(" ");
        rhsCounter.incrementCount(sb.toString().trim());
      }
    }

    List<String> biggestKeys = new ArrayList<String>(rhsCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));

    PrintWriter pw = tlpp.pw();
    for(String rhs : biggestKeys)
      pw.printf("%s\t%d%n", rhs,(int) rhsCounter.getCount(rhs));
    pw.close();
  }
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams$EnglishSubcategoryStripper

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.