Package edu.stanford.nlp.trees

Examples of edu.stanford.nlp.trees.DiskTreebank


    trf = record.trf == null ? new LabeledScoredTreeReaderFactory() : record.trf;
    transformer = record.treeTransformer;
    normalizer = record.treeNormalizer;
    treeFilter = record.treeFilter;

    treebank = new DiskTreebank(trf, record.encoding);
    if (record.treeRange != null) {
      treebank.loadPath(filename, record.treeRange);
    } else {
      treebank.loadPath(filename);
    }
View Full Code Here


    return index;
  }

  public static void main(String[] args) {
    // simple testing code
    Treebank treebank = new DiskTreebank();
    CategoryWordTag.suppressTerminalDetails = true;
    treebank.loadPath(args[0]);
    final HeadFinder chf = new NoPunctuationHeadFinder();
    treebank.apply(pt -> {
      pt.percolateHeads(chf);
      pt.pennPrint();
      System.out.println();
    });
  }
View Full Code Here

    return new MemoryTreebank(treeReaderFactory(), inputEncoding);
  }

  @Override
  public DiskTreebank diskTreebank() {
    return new DiskTreebank(treeReaderFactory(), inputEncoding);
  }
View Full Code Here

    return new TreeCollinizer(treebankLanguagePack());
  }

  @Override
  public DiskTreebank diskTreebank() {
   return new DiskTreebank(treeReaderFactory(), inputEncoding);
  }
View Full Code Here

      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    Language lang = Language.English;

    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i]);
      }
    }

    PrintWriter pw = tlpp.pw();
    Options op = new Options();
    Options.LexOptions lexOptions = op.lexOptions;
    if(lang == Language.French) {
      lexOptions.useUnknownWordSignatures = 1;
      lexOptions.smartMutation = false;
      lexOptions.unknownSuffixSize = 2;
      lexOptions.unknownPrefixSize = 1;
    } else if(lang == Language.Arabic) {
      lexOptions.smartMutation = false;
      lexOptions.useUnknownWordSignatures = 9;
      lexOptions.unknownPrefixSize = 1;
      lexOptions.unknownSuffixSize = 1;
    }
    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);
   
    int computeAfter = (int) (0.50 * tb.size());
    Counter<String> vocab = new ClassicCounter<String>();
    Counter<String> unkCounter = new ClassicCounter<String>();
    int treeId = 0;
    for(Tree t : tb) {
      List<Label> yield = t.yield();
View Full Code Here

    if (args.length < 3) {
      System.err.println("java BaseLexicon treebankPath fileRange unknownWordModel words*");
      return;
    }
    System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
    Treebank tb = new DiskTreebank();
    tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
    // TODO: change this interface so the lexicon creates its own indices?
    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    Options op = new Options();
    op.lexOptions.useUnknownWordSignatures = Integer.parseInt(args[2]);
    BaseLexicon lex = new BaseLexicon(op, wordIndex, tagIndex);
    lex.initializeTraining(tb.size());
    lex.train(tb);
    lex.finishTraining();
    System.out.println("done.");
    System.out.println();
    NumberFormat nf = NumberFormat.getNumberInstance();
View Full Code Here

    TreebankLangParserParams tlpp = Languages.getLanguageParams(language);
    String encoding = options.getProperty("e", "UTF-8");
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);

    DiskTreebank tb = tlpp.diskTreebank();
    tb.loadPath(fileName);

    // Statistics
    Counter<String> binaryRuleTypes = new ClassicCounter<String>(20000);
    List<Integer> branchingFactors = new ArrayList<Integer>(20000);
    int nTrees = 0;
View Full Code Here

      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";

    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            Language lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i]);
      }
    }

    Counter<String> vocab = new ClassicCounter<String>();
    for(Tree t : tb) {
View Full Code Here

      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    String puncTag = null;

    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            Language lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        puncTag = args[i++];
       
        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i]);
      }
    }

    Counter<String> puncTypes = new ClassicCounter<String>();
    for(Tree t : tb) {
View Full Code Here

    TreebankLangParserParams tlpp = Languages.getLanguageParams(language);
    String encoding = options.getProperty("e", "UTF-8");
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
   
    DiskTreebank tb = tlpp.diskTreebank();
    tb.loadPath(fileName);
   
    // Read the treebank
    PrintWriter pw = tlpp.pw();
    int numTrees = 0;
    for (Tree tree : tb) {
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.trees.DiskTreebank

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.