/**
 * Collects simplified lower-cased word-forms from the given files, chunk by chunk.
 * <p>
 * Trees are read from every file except the one at index {@code devId}. The distinct
 * forms seen in the current chunk are kept in a set; once the accumulated tree sizes
 * reach {@code maxCount}, that set is added to a {@link Prob1DMap} and a new chunk
 * starts. The chunk counter is NOT reset between files, so a chunk may span files.
 * Any forms left over after the last file are added as a final, partial chunk.
 * <p>
 * NOTE(review): presumably {@code Prob1DMap.addAll} counts each form once per chunk
 * and {@code toSet(cutoff)} keeps forms whose count passes the cutoff - confirm
 * against {@code Prob1DMap}.
 *
 * @param reader    reader that is (re)opened on each file and always closed afterwards.
 * @param filenames paths of the files to read.
 * @param devId     index in {@code filenames} of the file to skip.
 * @param cutoff    threshold handed to {@code Prob1DMap.toSet}.
 * @param maxCount  accumulated tree size at which the current chunk is flushed.
 * @return the set produced by {@code Prob1DMap.toSet(cutoff)}.
 */
private Set<String> getLowerSimplifiedFormsByDocumentFrequencies(JointReader reader, String[] filenames, int devId, int cutoff, int maxCount)
{
	Set<String> set = Sets.newHashSet();
	Prob1DMap map = new Prob1DMap();
	int i, j, len, count = 0, size = filenames.length;
	DEPTree tree;

	LOG.info(String.format("Collecting simplified-forms: cutoff = %d, max = %d\n", cutoff, maxCount));

	for (i=0; i<size; i++)
	{
		if (i == devId) continue;

		reader.open(UTInput.createBufferedFileReader(filenames[i]));

		try
		{
			while ((tree = reader.next()) != null)
			{
				len = tree.size();

				// Index 0 is skipped (presumably the artificial root node - confirm).
				for (j=1; j<len; j++)
					set.add(MPLib.getSimplifiedLowercaseWordForm(tree.get(j).form));

				// The counter advances by the full tree size, including index 0.
				if ((count += len) >= maxCount)
				{
					map.addAll(set);
					LOG.info(".");
					set.clear();
					count = 0;
				}
			}
		}
		finally
		{
			// Always release the reader, even when reading a tree fails midway.
			reader.close();
		}
	}

	LOG.info("\n");

	// Flush the trailing partial chunk.
	if (!set.isEmpty()) map.addAll(set);
	return map.toSet(cutoff);
}