Examples of edu.stanford.nlp.trees.TreebankLanguagePack

edu.stanford.nlp.trees.TreebankLanguagePack
This interface specifies language/treebank specific information for a Treebank, which a parser or other treebank user might need to know.

Some of this is fixed for a (treebank,language) pair, but some of it reflects feature extraction decisions, so it can be sensible to have multiple implementations of this interface for the same (treebank,language) pair.

So far this covers punctuation, character encodings, and characters reserved for label annotations. It should probably be expanded to cover other stuff (unknown words?).

Various methods in this class return arrays. You should treat them as read-only, even though one cannot enforce that in Java.

Implementations of this interface do not call basicCategory() on arguments before testing them, so if needed, you should explicitly call basicCategory() yourself before passing arguments to these routines for testing.
This class should be able to be an immutable singleton. It contains data on various things, but no state. At some point we should make it a real immutable singleton.

@author Christopher Manning
@version 1.1, Mar 2003

   *
   * @param tlpParams The treebank parser params
   * @return A suitable tree printing object
   */
  public TreePrint treePrint(TreebankLangParserParams tlpParams) {
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
    return new TreePrint(outputFormat, outputFormatOptions, tlp, tlpParams.headFinder(), tlpParams.typedDependencyHeadFinder());
  }

View Full Code Here

   * @param in Reader
   * @param tf TreeFactory -- factory to create some kind of Tree
   * @param tn the method of normalizing trees
   */
  public FrenchXMLTreeReader(Reader in, TreeFactory tf, TreeNormalizer tn) {
    TreebankLanguagePack tlp = new FrenchTreebankLanguagePack();
    stream = new ReaderInputStream(in,tlp.getEncoding());
    treeFactory = tf;
    treeNormalizer = tn;


    DocumentBuilder parser = XMLUtils.getXmlParser();
    try {

View Full Code Here

    }
  }


  public List<Tree> getAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank) {
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();


    if (VERBOSE) System.err.println("\n\n" + trainTreebank.textualSummary(tlp));


    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);

View Full Code Here

        else if (text.equals(".")) assertEquals(6, index);
      } else {
        // System.err.println(leaf + " is not a CoreLabel.");
      }
    }
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();


    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> deps = gs.typedDependenciesCCprocessed(true);
    // System.out.println(deps);

View Full Code Here

    synchronized(LexicalizedParserITest.class) {
      if (englishParser == null) {
        // sharing a bunch of code here with the webapp in
        // parser/webapp/index.jsp...  perhaps we could reuse that code
        englishParser = LexicalizedParser.loadModel();
        TreebankLanguagePack tLP =
          englishParser.getOp().tlpParams.treebankLanguagePack();
        tagPrint = new TreePrint("wordsAndTags", tLP);
        pennPrint = new TreePrint("penn", tLP);
        typDepPrint = new TreePrint("typedDependencies", "basicDependencies", tLP);
        typDepColPrint = new TreePrint("typedDependencies", tLP);  // default is now CCprocessed

View Full Code Here

  public double testOnTreebank(Treebank testTreebank) {
    System.err.println("Testing on treebank");
    Timing treebankTotalTimer = new Timing();
    TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = op.langpack();
    PrintWriter pwOut, pwErr;
    if (op.testOptions.quietEvaluation) {
      NullOutputStream quiet = new NullOutputStream();
      pwOut = tlpParams.pw(quiet);
      pwErr = tlpParams.pw(quiet);

View Full Code Here

      } else {
        i = op.setOptionOrWarn(args, i);
      }
    }
    // System.out.println(tlpParams.getClass());
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();


    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
    PrintWriter pw = op.tlpParams.pw();


    op.testOptions.display();
    op.trainOptions.display();
    op.display();
    op.tlpParams.display();


    // setup tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
    // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
    // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
    // blippTreebank.loadPath(blippPath, "", true);


    Timing.startTime();
    System.err.print("Reading trees...");
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    if (op.testOptions.increasingLength) {
      Collections.sort(testTreebank, new TreeLengthComparator());
    }


    trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
    Timing.tick("done.");


    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight) {
      binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    } else {
      binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams.headFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    }


    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc) {
      collinsPuncTransformer = new CollinsPuncTransformer(tlp);
    }
    TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
    List<Tree> binaryTrainTrees = new ArrayList<Tree>();


    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, op.tlpParams.treebankLanguagePack());
      if (op.trainOptions.deleteSplitters != null) {
        List<String> deleted = new ArrayList<String>();
        for (String del : op.trainOptions.deleteSplitters) {
          String baseDel = tlp.basicCategory(del);
          boolean checkBasic = del.equals(baseDel);
          for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) {
            String elem = it.next();
            String baseElem = tlp.basicCategory(elem);
            boolean delStr = checkBasic && baseElem.equals(baseDel) ||
              elem.equals(del);
            if (delStr) {
              it.remove();
              deleted.add(elem);
            }
          }
        }
        System.err.println("Removed from vertical splitters: " + deleted);
      }
    }
    if (op.trainOptions.selectivePostSplit) {
      TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams, op);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, op.tlpParams.treebankLanguagePack());
    }


    if (op.trainOptions.hSelSplit) {
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : trainTreebank) {
        if (op.trainOptions.collinsPunc) {
          tree = collinsPuncTransformer.transformTree(tree);
        }
        //tree.pennPrint(tlpParams.pw());
        tree = binarizer.transformTree(tree);
        //binaryTrainTrees.add(tree);
      }
      binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTrainTrees.add(tree);
    }
    if (op.testOptions.verbose) {
      binarizer.dumpStats();
    }


    List<Tree> binaryTestTrees = new ArrayList<Tree>();
    for (Tree tree : testTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTestTrees.add(tree);
    }
    Timing.tick("done.");  // binarization
    BinaryGrammar bg = null;
    UnaryGrammar ug = null;
    DependencyGrammar dg = null;
    // DependencyGrammar dgBLIPP = null;
    Lexicon lex = null;
    Index<String> stateIndex = new HashIndex<String>();


    // extract grammars
    Extractor<Pair<UnaryGrammar,BinaryGrammar>> bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
    //Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor();
    // Extractor lexExtractor = new LexiconExtractor();


    //Extractor dgExtractor = new DependencyMemGrammarExtractor();


    if (op.doPCFG) {
      System.err.print("Extracting PCFG...");
      Pair<UnaryGrammar, BinaryGrammar> bgug = null;
      if (op.trainOptions.cheatPCFG) {
        List<Tree> allTrees = new ArrayList<Tree>(binaryTrainTrees);
        allTrees.addAll(binaryTestTrees);
        bgug = bgExtractor.extract(allTrees);
      } else {
        bgug = bgExtractor.extract(binaryTrainTrees);
      }
      bg = bgug.second;
      bg.splitRules();
      ug = bgug.first;
      ug.purgeRules();
      Timing.tick("done.");
    }
    System.err.print("Extracting Lexicon...");
    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    lex = op.tlpParams.lex(op, wordIndex, tagIndex);
    lex.initializeTraining(binaryTrainTrees.size());
    lex.train(binaryTrainTrees);
    lex.finishTraining();
    Timing.tick("done.");


    if (op.doDep) {
      System.err.print("Extracting Dependencies...");
      binaryTrainTrees.clear();
      Extractor<DependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
      // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams,true));


      // DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
      //dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new TransformTreeDependency(tlpParams));


      //dg = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams));
      // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
      dg = dgExtractor.extract(binaryTrainTrees); //uses information whether the words are known or not, discards unknown words
      Timing.tick("done.");
      //System.out.print("Extracting Unknown Word Model...");
      //UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
      //Timing.tick("done.");
      System.out.print("Tuning Dependency Model...");
      dg.tune(binaryTestTrees);
      //System.out.println("TUNE DEPS: "+tuneDeps);
      Timing.tick("done.");
    }


    BinaryGrammar boundBG = bg;
    UnaryGrammar boundUG = ug;


    GrammarProjection gp = new NullGrammarProjection(bg, ug);


    // serialization
    if (serializeFile != null) {
      System.err.print("Serializing parser...");
      LexicalizedParser parser = new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
      parser.saveParserToSerialized(serializeFile);
      Timing.tick("done.");
    }


    // test: pcfg-parse and output


    ExhaustivePCFGParser parser = null;
    if (op.doPCFG) {
      parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex);
    }




    ExhaustiveDependencyParser dparser = ((op.doDep && ! op.testOptions.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex) : null);


    Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp, op), dparser) : null);
    //Scorer scorer = parser;
    BiLexPCFGParser bparser = null;
    if (op.doPCFG && op.doDep) {
      bparser = (op.testOptions.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex) : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex);
    }


    Evalb pcfgPE = new Evalb("pcfg  PE", true);
    Evalb comboPE = new Evalb("combo PE", true);
    AbstractEval pcfgCB = new Evalb.CBEval("pcfg  CB", true);


    AbstractEval pcfgTE = new TaggingEval("pcfg  TE");
    AbstractEval comboTE = new TaggingEval("combo TE");
    AbstractEval pcfgTEnoPunct = new TaggingEval("pcfg nopunct TE");
    AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE");
    AbstractEval depTE = new TaggingEval("depnd TE");


    AbstractEval depDE = new UnlabeledAttachmentEval("depnd DE", true, null, tlp.punctuationWordRejectFilter());
    AbstractEval comboDE = new UnlabeledAttachmentEval("combo DE", true, null, tlp.punctuationWordRejectFilter());


    if (op.testOptions.evalb) {
      EvalbFormatWriter.initEVALBfiles(op.tlpParams);
    }

View Full Code Here

        "nicht", ",", "also", "muß", "sie", "sie", "sehen", ";", "und", "die", "sehe", "man", "einmal", "in", "einem",
        "Paar", "spitzen", "Schultern", ",", "zylindrischen", "Schenkeln", ",", "oder", "leeren", "Ärmeln", ",",
        "oder", "lattenförmigen", "Beinen", "."


    };
    TreebankLanguagePack tlp = new NegraPennLanguagePack();
    Tokenizer<? extends HasWord> toke =tlp.getTokenizerFactory().getTokenizer(new StringReader(sample));
    List<? extends HasWord> tokens = toke.tokenize();
    List<? extends HasWord> goldTokens = Sentence.toWordList(tokenized);
    assertEquals("Tokenization length mismatch", goldTokens.size(), tokens.size());
    for (int i = 0, sz = goldTokens.size(); i < sz; i++) {
      assertEquals("Bad tokenization", goldTokens.get(i).word(), tokens.get(i).word());

View Full Code Here

    if (argMap.containsKey("-encoding")) {
      charset = argMap.get("-encoding")[0];
    }
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));


    TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
    String[] dpString = argMap.get("-dictPath");
    ChineseEnglishWordMap cewm = (dpString == null) ? new ChineseEnglishWordMap() : new ChineseEnglishWordMap(dpString[0]);
    int totalWords = 0, coveredWords = 0;


    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, charset), true);


    for (String line = r.readLine(); line != null; line = r.readLine()) {
      String[] words = line.split("\\s", 1000);
      for (String word : words) {
        totalWords++;
        if (word.length() == 0) continue;
        pw.print(StringUtils.pad(word + ':', 8));
        if (tlp.isPunctuationWord(word)) {
          totalWords--;
          pw.print(word);
  } else if (isDigits(word)) {
    pw.print(word + " [NUMBER]");
        } else if (cewm.containsKey(word)) {

View Full Code Here

    if (argMap.containsKey("-encoding")) {
      charset = argMap.get("-encoding")[0];
    }
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));


    TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
    String[] dpString = argMap.get("-dictPath");
    ChineseEnglishWordMap cewm = (dpString == null) ? new ChineseEnglishWordMap() : new ChineseEnglishWordMap(dpString[0]);
    int totalWords = 0, coveredWords = 0;


    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, charset), true);


    for (String line = r.readLine(); line != null; line = r.readLine()) {
      String[] words = line.split("\\s", 1000);
      for (String word : words) {
        totalWords++;
        if (word.length() == 0) continue;
        pw.print(StringUtils.pad(word + ':', 8));
        if (tlp.isPunctuationWord(word)) {
          totalWords--;
          pw.print(word);
  } else if (isDigits(word)) {
    pw.print(word + " [NUMBER]");
        } else if (cewm.containsKey(word)) {

View Full Code Here

0 1

TOP

Related Classes of edu.stanford.nlp.trees.TreebankLanguagePack

edu.stanford.nlp.international.arabic.parsesegment.JointParsingModel

edu.stanford.nlp.parser.DependencyIndexITest

edu.stanford.nlp.parser.lexparser.ChineseMaxentLexicon

edu.stanford.nlp.parser.lexparser.EvaluateTreebank

edu.stanford.nlp.parser.lexparser.FactoredParser

edu.stanford.nlp.parser.lexparser.LexicalizedParserITest

edu.stanford.nlp.parser.lexparser.LexicalizedParserQuery

edu.stanford.nlp.parser.lexparser.TestOptions

edu.stanford.nlp.pipeline.ParserAnnotator

edu.stanford.nlp.process.PTBTokenizerTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.