Package edu.stanford.nlp.trees

Examples of edu.stanford.nlp.trees.DiskTreebank


    }

    Properties options = StringUtils.argsToProperties(args, argDefs());
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = Languages.getLanguageParams(language);
    DiskTreebank tb = null;
    String encoding = options.getProperty("l", "UTF-8");
    boolean removeBracket = PropertiesUtils.getBool(options, "b", false);
   
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    tb = tlpp.diskTreebank();

    String[] files = options.getProperty("", "").split("\\s+");
    if (files.length != 0) {
      for (String filename : files) {
        tb.loadPath(filename);
      }
    } else {
      System.err.println(usage());
      System.exit(-1);
    }
View Full Code Here


  public ATBArabicDataset() {
    super();

    //Read the raw file as UTF-8 irrespective of output encoding
    treebank = new DiskTreebank(new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true), "UTF-8");
  }
View Full Code Here

      System.out.println(usage.toString());
      System.exit(-1);
    }

    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    TregexPattern rootMatch = null;
   
    for(int i = 0; i < args.length; i++) {
      if(args[i].startsWith("-")) {
        switch (args[i]) {
          case "-l":
            Language lang = Language.valueOf(args[++i].trim());
            tlpp = Languages.getLanguageParams(lang);

            break;
          case "-e":
            encoding = args[++i];

            break;
          default:
            System.out.println(usage.toString());
            System.exit(-1);
        }

      } else {
        rootMatch = TregexPattern.compile("@" + args[i++]);

        if(tb == null) {
          if(tlpp == null) {
            System.out.println(usage.toString());
            System.exit(-1);
          } else {
            tlpp.setInputEncoding(encoding);
            tlpp.setOutputEncoding(encoding);
            tb = tlpp.diskTreebank();
          }
        }
        tb.loadPath(args[i++]);
      }
    }

    Counter<String> rhsCounter = new ClassicCounter<String>();
    for(Tree t : tb) {
View Full Code Here

  }

  /* Returns a DiskTreebank with a NegraPennTokenizer and a
   * NegraPennTreeNormalizer */
  public DiskTreebank diskTreebank() {
    return new DiskTreebank(treeReaderFactory(), inputEncoding);
  }
View Full Code Here

   */
  public void addFileFolder(final EnumMap<FilterType, String> filters, final File[] files) {
    List<FileTreeNode> newFiles = new ArrayList<FileTreeNode>();
    findLoadableFiles(filters, files, newFiles, FileTreeModel.this.getRoot());//findLoadableFiles updates newFiles
    for(FileTreeNode fileNode : newFiles) {
      Treebank treebank = new DiskTreebank(trf, curEncoding);
      treebank.loadPath(fileNode.getFile(), null, true);
      TreeTransformer transformer = TregexGUI.getInstance().transformer;
      if (transformer != null) {
        treebank = new TransformingTreebank(treebank, transformer);
      }
      fileNode.setTreebank(treebank);
View Full Code Here

    if(useSplit) {
      List<ObservedCorpusStats> allSplitStats = new ArrayList<ObservedCorpusStats>();
      makeVocab = true;
      for(Map.Entry<Split, Set<String>> split : splitFileLists.entrySet()) {
        DiskTreebank tb = tlpp.diskTreebank();
        FileFilter splitFilter = new SplitFilter(split.getValue());
        for(String path : pathNames)
          tb.loadPath(path, splitFilter);
        ObservedCorpusStats splitStats = gatherStats(tb,languageName.toString() + "." + split.getKey().toString());
        allSplitStats.add(splitStats);
        makeVocab = false;
      }

      display(aggregateStats(allSplitStats), displayWords, displayOOV);
      for(ObservedCorpusStats ocs : allSplitStats)
        display(ocs, displayWords, displayOOV);

    } else if(pathsAreFiles) {
      makeVocab = true;
      for(String path : pathNames) {
        DiskTreebank tb = tlpp.diskTreebank();
        tb.loadPath(path, pathname -> true);

        ObservedCorpusStats stats = gatherStats(tb, languageName.toString() + "  " + path.toString());
        display(stats, displayWords, displayOOV);
        makeVocab = false;
      }

    } else {
      trainVocab = Generics.newHashSet();
      DiskTreebank tb = tlpp.diskTreebank();
      for(String path : pathNames)
        tb.loadPath(path, pathname -> !pathname.isDirectory());

      ObservedCorpusStats allStats = gatherStats(tb, languageName.toString());
      display(allStats, displayWords, displayOOV);
    }
  }
View Full Code Here

    return new TreeCollinizer(treebankLanguagePack(),collinizerRetainsPunctuation,false);
  }

  @Override
  public DiskTreebank diskTreebank() {
    return new DiskTreebank(treeReaderFactory(), inputEncoding);
  }
View Full Code Here

      System.err.println("usage: Relation treebank numberRanges");
      return;
    }
    FileFilter testFilt = new NumberRangesFileFilter(args[1], true);
    TreeReaderFactory trf = new PennTreeReaderFactory(new NPTmpRetainingTreeNormalizer());
    DiskTreebank testTreebank = new DiskTreebank(trf);
    testTreebank.loadPath(new File(args[0]), testFilt);
    HeadFinder hf = new ModCollinsHeadFinder();
    List<Relation> relations = new ArrayList<Relation>();
    relations.addAll(Arrays.asList(SIMPLE_RELATIONS));
    relations.add(new HasIthChild(2));
    relations.add(new HasIthChild(-1));
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.trees.DiskTreebank

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.