// Gather and display corpus statistics in one of three modes, chosen by
// useSplit and pathsAreFiles:
//   1. useSplit       - one treebank per entry of splitFileLists
//   2. pathsAreFiles  - one treebank per entry of pathNames
//   3. otherwise      - a single treebank holding every entry of pathNames
if(useSplit) {
// Per-split statistics, kept so they can be aggregated and then shown individually.
List<ObservedCorpusStats> allSplitStats = new ArrayList<ObservedCorpusStats>();
// makeVocab is true only while the first split is processed; it is cleared
// at the end of the first loop iteration.
// NOTE(review): presumably gatherStats reads makeVocab to decide whether to
// build the vocabulary -- confirm. Which split comes first depends on the
// iteration order of splitFileLists.
makeVocab = true;
for(Map.Entry<Split, Set<String>> split : splitFileLists.entrySet()) {
// A fresh treebank for this split; every path is loaded through a
// SplitFilter constructed from this split's set of strings.
DiskTreebank tb = tlpp.diskTreebank();
FileFilter splitFilter = new SplitFilter(split.getValue());
for(String path : pathNames)
tb.loadPath(path, splitFilter);
// The label passed to gatherStats is "<languageName>.<split key>".
ObservedCorpusStats splitStats = gatherStats(tb,languageName.toString() + "." + split.getKey().toString());
allSplitStats.add(splitStats);
makeVocab = false;
}
// Show the aggregate over all splits first, then each split on its own.
display(aggregateStats(allSplitStats), displayWords, displayOOV);
for(ObservedCorpusStats ocs : allSplitStats)
display(ocs, displayWords, displayOOV);
} else if(pathsAreFiles) {
// As in the split branch, makeVocab is true only for the first path processed.
makeVocab = true;
for(String path : pathNames) {
// Each path gets its own treebank; the filter accepts every pathname.
DiskTreebank tb = tlpp.diskTreebank();
tb.loadPath(path, pathname -> true);
// The label passed to gatherStats is "<languageName> <path>"; the stats
// for each path are displayed immediately, with no aggregation.
ObservedCorpusStats stats = gatherStats(tb, languageName.toString() + " " + path.toString());
display(stats, displayWords, displayOOV);
makeVocab = false;
}
} else {
// trainVocab is replaced with a fresh set. makeVocab is not assigned in
// this branch, so it keeps whatever value it had on entry.
trainVocab = Generics.newHashSet();
// All paths are loaded into one treebank; the filter rejects directories.
DiskTreebank tb = tlpp.diskTreebank();
for(String path : pathNames)
tb.loadPath(path, pathname -> !pathname.isDirectory());
// A single set of statistics, labelled with languageName only.
ObservedCorpusStats allStats = gatherStats(tb, languageName.toString());
display(allStats, displayWords, displayOOV);
}
}