Package edu.stanford.nlp.trees

Examples of edu.stanford.nlp.trees.TreebankLanguagePack
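
TreebankLanguagePack collects the language-specific facts that tree-processing code needs about a treebank: its character encoding, its punctuation words and tags, its start symbol, and how to strip functional annotation from category labels. The excerpts below show how Stanford NLP code obtains and uses one. First, a minimal self-contained sketch (the class name and example values are illustrative):

import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;

public class TlpBasics {
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    System.out.println(tlp.getEncoding());           // treebank character encoding
    System.out.println(tlp.isPunctuationWord("."));  // true: "." is punctuation in the Penn Treebank
    System.out.println(tlp.basicCategory("NP-SBJ")); // "NP": functional annotation stripped
    System.out.println(tlp.startSymbol());           // the root symbol used by the parser
  }
}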


  /**
   * @param tlpParams The treebank parser params
   * @return A suitable tree printing object
   */
  public TreePrint treePrint(TreebankLangParserParams tlpParams) {
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
    return new TreePrint(outputFormat, outputFormatOptions, tlp, tlpParams.headFinder(), tlpParams.typedDependencyHeadFinder());
  }
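
The method above wires a language pack and its head finders into a TreePrint. A minimal sketch of using TreePrint directly with a language pack (the tree literal is made up for illustration):

import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TreePrint;

public class TreePrintSketch {
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    Tree tree = Tree.valueOf("(ROOT (S (NP (DT The) (NN dog)) (VP (VBZ barks))))");
    TreePrint printer = new TreePrint("penn", tlp);  // same format names as in the excerpts below
    printer.printTree(tree);                         // writes the bracketed tree to System.out
  }
}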


  /**
   * @param in Reader
   * @param tf TreeFactory -- factory to create some kind of Tree
   * @param tn the method of normalizing trees
   */
  public FrenchXMLTreeReader(Reader in, TreeFactory tf, TreeNormalizer tn) {
    TreebankLanguagePack tlp = new FrenchTreebankLanguagePack();
    stream = new ReaderInputStream(in,tlp.getEncoding());
    treeFactory = tf;
    treeNormalizer = tn;

    DocumentBuilder parser = XMLUtils.getXmlParser();
    try {
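
FrenchXMLTreeReader above uses the language pack's encoding to wrap the incoming Reader. For ordinary Penn-style bracketed trees, the generic TreeReader machinery follows the same pattern; a minimal sketch (the tree string is illustrative):

import java.io.StringReader;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreeReader;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;

public class TreeReaderSketch {
  public static void main(String[] args) throws Exception {
    TreeReader tr = new PennTreeReader(new StringReader("(S (NP (DT a) (NN test)))"),
                                       new LabeledScoredTreeFactory());
    Tree tree = tr.readTree();  // returns null once the stream is exhausted
    tree.pennPrint();
    tr.close();
  }
}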

    }
  }

  public List<Tree> getAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank) {
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();

    if (VERBOSE) System.err.println("\n\n" + trainTreebank.textualSummary(tlp));

    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
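
Note that this excerpt, like the first one, gets its TreebankLanguagePack from a TreebankLangParserParams object instead of constructing one directly, which keeps the surrounding code language-neutral. A minimal sketch, assuming the English parameters class:

import edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.trees.TreebankLanguagePack;

public class ParamsSketch {
  public static void main(String[] args) {
    TreebankLangParserParams tlpParams = new EnglishTreebankParserParams();
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();  // English pack behind a language-neutral interface
    System.out.println(tlp.getEncoding());
  }
}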

        else if (text.equals(".")) assertEquals(6, index);
      } else {
        // System.err.println(leaf + " is not a CoreLabel.");
      }
    }
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> deps = gs.typedDependenciesCCprocessed(true);
    // System.out.println(deps);
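
A condensed, runnable version of the dependency-extraction pattern in this test (the tree literal is made up, and the no-argument typedDependenciesCCprocessed() variant is used):

import java.util.Collection;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;

public class DepsSketch {
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    Tree tree = Tree.valueOf("(ROOT (S (NP (PRP I)) (VP (VBP like) (NP (NN tea)))))");
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> deps = gs.typedDependenciesCCprocessed();
    System.out.println(deps);  // e.g. nsubj(like-2, I-1), dobj(like-2, tea-3)
  }
}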

    synchronized(LexicalizedParserITest.class) {
      if (englishParser == null) {
        // sharing a bunch of code here with the webapp in
        // parser/webapp/index.jsp...  perhaps we could reuse that code
        englishParser = LexicalizedParser.loadModel();
        TreebankLanguagePack tLP =
          englishParser.getOp().tlpParams.treebankLanguagePack();
        tagPrint = new TreePrint("wordsAndTags", tLP);
        pennPrint = new TreePrint("penn", tLP);
        typDepPrint = new TreePrint("typedDependencies", "basicDependencies", tLP);
        typDepColPrint = new TreePrint("typedDependencies", tLP); // default is now CCprocessed
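
The test above loads the default English model once and shares it across test methods. A minimal end-to-end sketch, assuming the default model jar is on the classpath:

import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

public class ParseSketch {
  public static void main(String[] args) {
    LexicalizedParser parser = LexicalizedParser.loadModel();  // default English PCFG model
    List<HasWord> sentence = Sentence.toWordList("This", "is", "a", "test", ".");
    Tree tree = parser.parse(sentence);
    tree.pennPrint();
  }
}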

  public double testOnTreebank(Treebank testTreebank) {
    System.err.println("Testing on treebank");
    Timing treebankTotalTimer = new Timing();
    TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = op.langpack();
    PrintWriter pwOut, pwErr;
    if (op.testOptions.quietEvaluation) {
      NullOutputStream quiet = new NullOutputStream();
      pwOut = tlpParams.pw(quiet);
      pwErr = tlpParams.pw(quiet);

      } else {
        i = op.setOptionOrWarn(args, i);
      }
    }
    // System.out.println(tlpParams.getClass());
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();

    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
    PrintWriter pw = op.tlpParams.pw();

    op.testOptions.display();
    op.trainOptions.display();
    op.display();
    op.tlpParams.display();

    // setup tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
    // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
    // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
    // blippTreebank.loadPath(blippPath, "", true);

    Timing.startTime();
    System.err.print("Reading trees...");
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    if (op.testOptions.increasingLength) {
      Collections.sort(testTreebank, new TreeLengthComparator());
    }

    trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
    Timing.tick("done.");

    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight) {
      binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    } else {
      binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams.headFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    }

    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc) {
      collinsPuncTransformer = new CollinsPuncTransformer(tlp);
    }
    TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
    List<Tree> binaryTrainTrees = new ArrayList<Tree>();

    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, op.tlpParams.treebankLanguagePack());
      if (op.trainOptions.deleteSplitters != null) {
        List<String> deleted = new ArrayList<String>();
        for (String del : op.trainOptions.deleteSplitters) {
          String baseDel = tlp.basicCategory(del);
          boolean checkBasic = del.equals(baseDel);
          for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) {
            String elem = it.next();
            String baseElem = tlp.basicCategory(elem);
            boolean delStr = checkBasic && baseElem.equals(baseDel) ||
              elem.equals(del);
            if (delStr) {
              it.remove();
              deleted.add(elem);
            }
          }
        }
        System.err.println("Removed from vertical splitters: " + deleted);
      }
    }
    if (op.trainOptions.selectivePostSplit) {
      TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams, op);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, op.tlpParams.treebankLanguagePack());
    }

    if (op.trainOptions.hSelSplit) {
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : trainTreebank) {
        if (op.trainOptions.collinsPunc) {
          tree = collinsPuncTransformer.transformTree(tree);
        }
        //tree.pennPrint(tlpParams.pw());
        tree = binarizer.transformTree(tree);
        //binaryTrainTrees.add(tree);
      }
      binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTrainTrees.add(tree);
    }
    if (op.testOptions.verbose) {
      binarizer.dumpStats();
    }

    List<Tree> binaryTestTrees = new ArrayList<Tree>();
    for (Tree tree : testTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTestTrees.add(tree);
    }
    Timing.tick("done."); // binarization
    BinaryGrammar bg = null;
    UnaryGrammar ug = null;
    DependencyGrammar dg = null;
    // DependencyGrammar dgBLIPP = null;
    Lexicon lex = null;
    Index<String> stateIndex = new HashIndex<String>();

    // extract grammars
    Extractor<Pair<UnaryGrammar,BinaryGrammar>> bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
    //Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor();
    // Extractor lexExtractor = new LexiconExtractor();

    //Extractor dgExtractor = new DependencyMemGrammarExtractor();

    if (op.doPCFG) {
      System.err.print("Extracting PCFG...");
      Pair<UnaryGrammar, BinaryGrammar> bgug = null;
      if (op.trainOptions.cheatPCFG) {
        List<Tree> allTrees = new ArrayList<Tree>(binaryTrainTrees);
        allTrees.addAll(binaryTestTrees);
        bgug = bgExtractor.extract(allTrees);
      } else {
        bgug = bgExtractor.extract(binaryTrainTrees);
      }
      bg = bgug.second;
      bg.splitRules();
      ug = bgug.first;
      ug.purgeRules();
      Timing.tick("done.");
    }
    System.err.print("Extracting Lexicon...");
    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    lex = op.tlpParams.lex(op, wordIndex, tagIndex);
    lex.initializeTraining(binaryTrainTrees.size());
    lex.train(binaryTrainTrees);
    lex.finishTraining();
    Timing.tick("done.");

    if (op.doDep) {
      System.err.print("Extracting Dependencies...");
      binaryTrainTrees.clear();
      Extractor<DependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
      // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams,true));

      // DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
      //dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new TransformTreeDependency(tlpParams));

      //dg = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams));
      // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
      dg = dgExtractor.extract(binaryTrainTrees); // uses whether each word is known or unknown; unknown words are discarded
      Timing.tick("done.");
      //System.out.print("Extracting Unknown Word Model...");
      //UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
      //Timing.tick("done.");
      System.out.print("Tuning Dependency Model...");
      dg.tune(binaryTestTrees);
      //System.out.println("TUNE DEPS: "+tuneDeps);
      Timing.tick("done.");
    }

    BinaryGrammar boundBG = bg;
    UnaryGrammar boundUG = ug;

    GrammarProjection gp = new NullGrammarProjection(bg, ug);

    // serialization
    if (serializeFile != null) {
      System.err.print("Serializing parser...");
      LexicalizedParser parser = new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
      parser.saveParserToSerialized(serializeFile);
      Timing.tick("done.");
    }

    // test: pcfg-parse and output

    ExhaustivePCFGParser parser = null;
    if (op.doPCFG) {
      parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex);
    }


    ExhaustiveDependencyParser dparser = ((op.doDep && ! op.testOptions.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex) : null);

    Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp, op), dparser) : null);
    //Scorer scorer = parser;
    BiLexPCFGParser bparser = null;
    if (op.doPCFG && op.doDep) {
      bparser = (op.testOptions.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex) : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex);
    }

    Evalb pcfgPE = new Evalb("pcfg  PE", true);
    Evalb comboPE = new Evalb("combo PE", true);
    AbstractEval pcfgCB = new Evalb.CBEval("pcfg  CB", true);

    AbstractEval pcfgTE = new TaggingEval("pcfg  TE");
    AbstractEval comboTE = new TaggingEval("combo TE");
    AbstractEval pcfgTEnoPunct = new TaggingEval("pcfg nopunct TE");
    AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE");
    AbstractEval depTE = new TaggingEval("depnd TE");

    AbstractEval depDE = new UnlabeledAttachmentEval("depnd DE", true, null, tlp.punctuationWordRejectFilter());
    AbstractEval comboDE = new UnlabeledAttachmentEval("combo DE", true, null, tlp.punctuationWordRejectFilter());

    if (op.testOptions.evalb) {
      EvalbFormatWriter.initEVALBfiles(op.tlpParams);
    }
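
The pipeline above reads its treebanks through the params object's memoryTreebank() and testMemoryTreebank() factories. The Treebank API can also be used directly; a minimal sketch (the path is hypothetical):

import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.trees.TreebankLanguagePack;

public class TreebankSketch {
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    Treebank treebank = new MemoryTreebank();
    treebank.loadPath("/path/to/mrg/files");           // hypothetical directory of tree files
    System.out.println(treebank.textualSummary(tlp));  // prints a summary of the loaded trees
  }
}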

        "nicht", ",", "also", "muß", "sie", "sie", "sehen", ";", "und", "die", "sehe", "man", "einmal", "in", "einem",
        "Paar", "spitzen", "Schultern", ",", "zylindrischen", "Schenkeln", ",", "oder", "leeren", "Ärmeln", ",",
        "oder", "lattenförmigen", "Beinen", "."

    };
    TreebankLanguagePack tlp = new NegraPennLanguagePack();
    Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(sample));
    List<? extends HasWord> tokens = toke.tokenize();
    List<? extends HasWord> goldTokens = Sentence.toWordList(tokenized);
    assertEquals("Tokenization length mismatch", goldTokens.size(), tokens.size());
    for (int i = 0, sz = goldTokens.size(); i < sz; i++) {
      assertEquals("Bad tokenization", goldTokens.get(i).word(), tokens.get(i).word());
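
The same getTokenizerFactory() pattern works for any language pack, not just the German one above. A minimal English sketch (the input sentence is illustrative):

import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;

public class TokenizeSketch {
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    Tokenizer<? extends HasWord> toke =
        tlp.getTokenizerFactory().getTokenizer(new StringReader("A test, isn't it?"));
    List<? extends HasWord> tokens = toke.tokenize();
    System.out.println(tokens);  // Penn Treebank-style tokens, e.g. "is" and "n't" split
  }
}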

    if (argMap.containsKey("-encoding")) {
      charset = argMap.get("-encoding")[0];
    }
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));

    TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
    String[] dpString = argMap.get("-dictPath");
    ChineseEnglishWordMap cewm = (dpString == null) ? new ChineseEnglishWordMap() : new ChineseEnglishWordMap(dpString[0]);
    int totalWords = 0, coveredWords = 0;

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, charset), true);

    for (String line = r.readLine(); line != null; line = r.readLine()) {
      String[] words = line.split("\\s", 1000);
      for (String word : words) {
        totalWords++;
        if (word.length() == 0) continue;
        pw.print(StringUtils.pad(word + ':', 8));
        if (tlp.isPunctuationWord(word)) {
          totalWords--;
          pw.print(word);
        } else if (isDigits(word)) {
          pw.print(word + " [NUMBER]");
        } else if (cewm.containsKey(word)) {
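
Here the reader's charset comes from the -encoding flag, with the language pack consulted only for punctuation. The pack's own encoding can serve as the default instead; a minimal sketch (the file name is hypothetical):

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack;

public class EncodingSketch {
  public static void main(String[] args) throws IOException {
    TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
    BufferedReader r = new BufferedReader(new InputStreamReader(
        new FileInputStream("chinese.txt"), tlp.getEncoding()));  // hypothetical input file
    System.out.println(r.readLine());
    r.close();
  }
}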

    if (argMap.containsKey("-encoding")) {
      charset = argMap.get("-encoding")[0];
    }
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));

    TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
    String[] dpString = argMap.get("-dictPath");
    ChineseEnglishWordMap cewm = (dpString == null) ? new ChineseEnglishWordMap() : new ChineseEnglishWordMap(dpString[0]);
    int totalWords = 0, coveredWords = 0;

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, charset), true);

    for (String line = r.readLine(); line != null; line = r.readLine()) {
      String[] words = line.split("\\s", 1000);
      for (String word : words) {
        totalWords++;
        if (word.length() == 0) continue;
        pw.print(StringUtils.pad(word + ':', 8));
        if (tlp.isPunctuationWord(word)) {
          totalWords--;
          pw.print(word);
  } else if (isDigits(word)) {
    pw.print(word + " [NUMBER]");
        } else if (cewm.containsKey(word)) {
View Full Code Here
