Package edu.stanford.nlp.parser.lexparser

Examples of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams
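
All of the snippets below share the same basic pattern: obtain a TreebankLangParserParams for a language, then use it to build language-specific machinery such as a treebank reader, head finder, and encoding-aware writer. Here is a minimal, self-contained sketch of that pattern; the treebank path is a placeholder and the import locations are indicative (the Language enum has moved between packages across CoreNLP releases).

import java.io.PrintWriter;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.parser.lexparser.Languages;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.trees.Treebank;

public class TlppOverview {
  public static void main(String[] args) {
    // Language-specific parser parameters (English here; other languages work the same way)
    TreebankLangParserParams tlpp = Languages.getLanguageParams(Language.English);

    // Encoding-aware PrintWriter supplied by the params
    PrintWriter pw = tlpp.pw();

    // Read a treebank from disk with the language's tree reader
    Treebank treebank = tlpp.diskTreebank();
    treebank.loadPath("/path/to/treebank");  // placeholder path

    pw.println(treebank.textualSummary());
    pw.println("Head finder: " + tlpp.headFinder().getClass().getName());
  }
}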


  final TreeBinarizer binarizer;
  final String tlppClass;

  public BinarizerAnnotator(String annotatorName, Properties props) {
    this.tlppClass = props.getProperty(annotatorName + ".tlppClass", DEFAULT_TLPP_CLASS);
    TreebankLangParserParams tlpp = ReflectionLoading.loadByReflection(tlppClass);
    this.binarizer = TreeBinarizer.simpleTreeBinarizer(tlpp.headFinder(), tlpp.treebankLanguagePack());
  }
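
A hypothetical way to construct the annotator above, assuming it is registered under the name "binarizer"; the property key simply follows the annotatorName + ".tlppClass" pattern used by the constructor.

  Properties props = new Properties();
  props.setProperty("binarizer.tlppClass",
      "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams");
  BinarizerAnnotator binarizerAnnotator = new BinarizerAnnotator("binarizer", props);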


      System.exit(-1);
    }
    File goldFile = new File(parsedArgs[0]);
    File guessFile = new File(parsedArgs[1]);
   
    final TreebankLangParserParams tlpp = Languages.getLanguageParams(LANGUAGE);
    final PrintWriter pwOut = tlpp.pw();

    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());

    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());

    final CollinsDepEval depEval = new CollinsDepEval("CollinsDep", true, tlpp.headFinder(), tlpp.treebankLanguagePack().startSymbol());

    final TreeTransformer tc = tlpp.collinizer();

    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
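
The snippet is cut off above (the status list is not reproduced here). One plausible continuation, sketched here rather than taken from the original code, walks the two treebanks in parallel, collinizes each pair, and only scores pairs whose yields agree; the evaluate and display calls assume the usual Stanford AbstractEval-style signatures.

    Iterator<Tree> goldItr = goldTreebank.iterator();
    Iterator<Tree> guessItr = guessTreebank.iterator();
    while (goldItr.hasNext() && guessItr.hasNext()) {
      Tree guess = tc.transformTree(guessItr.next());
      Tree gold = tc.transformTree(goldItr.next());
      if (guess.yield().size() == gold.yield().size()) {
        depEval.evaluate(guess, gold, pwOut);  // assumed AbstractEval-style signature
      }
      // on a mismatch the real code scans ahead in the gold treebank, per the comment above
    }
    depEval.display(true, pwOut);  // assumed: print the final scores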

    if(args.length < minArgs) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    int maxGuessYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    boolean skipGuess = false;
    boolean tagMode = false;
    String guessFile = null;
    String goldFile = null;

    for(int i = 0; i < args.length; i++) {

      if(args[i].startsWith("-")) {

        switch (args[i]) {
          case "-l":
            Language lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-y":
            maxGoldYield = Integer.parseInt(args[++i].trim());

            break;
          case "-t":
            tagMode = true;

            break;
          case "-v":
            VERBOSE = true;

            break;
          case "-g":
            maxGuessYield = Integer.parseInt(args[++i].trim());
            skipGuess = true;

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        //Required parameters
        goldFile = args[i++];
        guessFile = args[i];
        break;
      }
    }

    final PrintWriter pwOut = tlpp.pw();

    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());

    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());

    final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
    final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);

    final TreeTransformer tc = tlpp.collinizer();

    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:

      System.err.printf("Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName());
      System.exit(-1);
    }

    Language language = Language.valueOf(args[0]);
    TreebankLangParserParams tlpp = Languages.getLanguageParams(language);
    if (language.equals(Language.Arabic)) {
      String[] options = {"-arabicFactored"};
      tlpp.setOptionFlag(options, 0);
    } else {
      String[] options = {"-frenchFactored"};
      tlpp.setOptionFlag(options, 0);
    }
    Treebank tb = tlpp.diskTreebank();
    tb.loadPath(args[1]);

    MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ?
        new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();

    String[] features = args[2].trim().split(",");
    for (String feature : features) {
      morphoSpec.activate(MorphoFeatureType.valueOf(feature));
    }

    // Counters
    Counter<String> wordTagCounter = new ClassicCounter<String>(30000);
    Counter<String> morphTagCounter = new ClassicCounter<String>(500);
//    Counter<String> signatureTagCounter = new ClassicCounter<String>();
    Counter<String> morphCounter = new ClassicCounter<String>(500);
    Counter<String> wordCounter = new ClassicCounter<String>(30000);
    Counter<String> tagCounter = new ClassicCounter<String>(300);

    Counter<String> lemmaCounter = new ClassicCounter<String>(25000);
    Counter<String> lemmaTagCounter = new ClassicCounter<String>(25000);

    Counter<String> richTagCounter = new ClassicCounter<String>(1000);

    Counter<String> reducedTagCounter = new ClassicCounter<String>(500);

    Counter<String> reducedTagLemmaCounter = new ClassicCounter<String>(500);

    Map<String,Set<String>> wordLemmaMap = Generics.newHashMap();

    TwoDimensionalIntCounter<String,String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<String,String>(30000);
    TwoDimensionalIntCounter<String,String> reducedTagTagCounter = new TwoDimensionalIntCounter<String,String>(500);
    TwoDimensionalIntCounter<String,String> tagReducedTagCounter = new TwoDimensionalIntCounter<String,String>(300);

    int numTrees = 0;
    for (Tree tree : tb) {
      for (Tree subTree : tree) {
        if (!subTree.isLeaf()) {
          tlpp.transformTree(subTree, tree);
        }
      }
      List<Label> pretermList = tree.preTerminalYield();
      List<Label> yield = tree.yield();
      assert yield.size() == pretermList.size();

    if(args.length < minArgs) {
      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    Language lang = Language.English;

    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i]);
      }
    }

    PrintWriter pw = tlpp.pw();
    Options op = new Options();
    Options.LexOptions lexOptions = op.lexOptions;
    if(lang == Language.French) {
      lexOptions.useUnknownWordSignatures = 1;
      lexOptions.smartMutation = false;
      lexOptions.unknownSuffixSize = 2;
      lexOptions.unknownPrefixSize = 1;
    } else if(lang == Language.Arabic) {
      lexOptions.smartMutation = false;
      lexOptions.useUnknownWordSignatures = 9;
      lexOptions.unknownPrefixSize = 1;
      lexOptions.unknownSuffixSize = 1;
    }
    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);
   
    int computeAfter = (int) (0.50 * tb.size());
    Counter<String> vocab = new ClassicCounter<String>();
    Counter<String> unkCounter = new ClassicCounter<String>();
    int treeId = 0;
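
The loop this snippet sets up is truncated here. A plausible continuation, hypothetical rather than the original code, builds the vocabulary from the first half of the treebank and counts word types it has not seen when reading the second half.

    for (Tree t : tb) {
      for (Label word : t.yield()) {
        if (treeId < computeAfter) {
          vocab.incrementCount(word.value());        // build the vocabulary on the first half
        } else if (!vocab.containsKey(word.value())) {
          unkCounter.incrementCount(word.value());   // unseen relative to the first half
        }
      }
      treeId++;
    }
    pw.println("Unknown word types: " + unkCounter.size());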

  public static TreebankLangParserParams getLanguageParams(String lang) {
    return getLanguageParams(Language.valueOf(lang));
  }

  public static TreebankLangParserParams getLanguageParams(Language lang) {
    TreebankLangParserParams tlpp; // initialized below
    switch(lang) {
    case Arabic:
      tlpp = new ArabicTreebankParserParams();
      break;
  

    boolean makeCopulaHead = props.getProperty("makeCopulaHead") != null;

    // TODO: if a parser is specified, load this from the parser
    // instead of loading it this way
    String tLPP = props.getProperty("tLPP", "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams");
    TreebankLangParserParams params = ReflectionLoading.loadByReflection(tLPP);
    if (makeCopulaHead) {
      // TODO: generalize and allow for more options
      String[] options = { "-makeCopulaHead" };
      params.setOptionFlag(options, 0);
    }

    if (sentFileName == null && (altDepReaderName == null || altDepReaderFilename == null) && treeFileName == null && conllXFileName == null && filter == null) {
      try {
        System.err.println("Usage: java GrammaticalStructure [options]* [-sentFile|-treeFile|-conllxFile file] [-testGraph]");
        System.err.println("  options: -basic, -collapsed, -CCprocessed [the default], -collapsedTree, -parseTree, -test, -parserFile file, -conllx, -keepPunct, -altprinter -altreader -altreaderfile");
        TreeReader tr = new PennTreeReader(new StringReader("((S (NP (NNP Sam)) (VP (VBD died) (NP-TMP (NN today)))))"));
        tb.add(tr.readTree());
      } catch (Exception e) {
        System.err.println("Horrible error: " + e);
        e.printStackTrace();
      }
    } else if (altDepReaderName != null && altDepReaderFilename != null) {
      DependencyReader altDepReader = loadAlternateDependencyReader(altDepReaderName);
      try {
        gsBank = altDepReader.readDependencies(altDepReaderFilename);
      } catch (IOException e) {
        System.err.println("Error reading " + altDepReaderFilename);
        return;
      }
    } else if (treeFileName != null) {
      tb.loadPath(treeFileName);
    } else if (filter != null) {
      tb.load(new BufferedReader(new InputStreamReader(System.in)));
    } else if (conllXFileName != null) {
      try {
        gsBank = params.readGrammaticalStructureFromFile(conllXFileName);
      } catch (RuntimeIOException e) {
        System.err.println("Error reading " + conllXFileName);
        return;
      }
    } else {

    if (fileName == null || fileName.equals("")) {
      System.out.println(usage);
      System.exit(-1);
    }
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = Languages.getLanguageParams(language);
    String encoding = options.getProperty("e", "UTF-8");
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);

    DiskTreebank tb = tlpp.diskTreebank();
    tb.loadPath(fileName);

    // Statistics
    Counter<String> binaryRuleTypes = new ClassicCounter<String>(20000);
    List<Integer> branchingFactors = new ArrayList<Integer>(20000);
    int nTrees = 0;
    int nUnaryRules = 0;
    int nBinaryRules = 0;
    int binaryBranchingFactors = 0;

    // Read the treebank
    PrintWriter pw = tlpp.pw();
    for (Tree tree : tb) {
      if (tree.value().equals("ROOT")) {
        tree = tree.firstChild();
      }
      ++nTrees;
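
The per-tree bookkeeping is truncated above. A rough sketch of what the declared counters suggest, hypothetical rather than the original code, tallies rule arities over the internal nodes of each tree.

      for (Tree subTree : tree) {
        if (subTree.isLeaf() || subTree.isPreTerminal()) continue;
        int arity = subTree.numChildren();
        if (arity == 1) {
          nUnaryRules++;
        } else if (arity == 2) {
          nBinaryRules++;
          binaryRuleTypes.incrementCount(subTree.value() + " -> "
              + subTree.firstChild().value() + " " + subTree.lastChild().value());
        }
        branchingFactors.add(arity);
        binaryBranchingFactors += arity;  // running sum, presumably for an average later
      }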

    // the intermediate GrammaticalStructure instead
    SemanticGraph graph = SemanticGraphFactory.generateUncollapsedDependencies(tree);

    // Alternatively, this could have been the Chinese params or any
    // other supported language.  As of 2014, only English and Chinese are supported.
    TreebankLangParserParams params = new EnglishTreebankParserParams();
    GrammaticalStructureFactory gsf = params.treebankLanguagePack().grammaticalStructureFactory(params.treebankLanguagePack().punctuationWordRejectFilter(), params.typedDependencyHeadFinder());

    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);

    System.err.println(graph);
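
As a follow-up to the snippet above, the GrammaticalStructure can be queried for typed dependencies; typedDependencies() is the standard accessor, though the exact set of variants differs across CoreNLP versions.

    for (TypedDependency td : gs.typedDependencies()) {
      System.err.println(td);
    }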

    if(!validateCommandLine(args)) {
      System.err.println(USAGE);
      System.exit(-1);
    }

    final TreebankLangParserParams tlpp = Languages.getLanguageParams(LANGUAGE);
    final PrintWriter pwOut = tlpp.pw();

    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());

    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());

    final LeafAncestorEval metric = new LeafAncestorEval("LeafAncestor");

    final TreeTransformer tc = tlpp.collinizer();

    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
