int dfc = getDocumentFrequencyCutoff(eTrain);
int dtc = getDocumentTokenCount(eTrain);
DEPTree tree;
Set<String> sLsfs = getSimplifiedFormsByDocumentFrequencies(reader, trainFiles, dfc, dtc);
EnglishOnlinePOSTagger component = new EnglishOnlinePOSTagger(xmls, sLsfs);
LOG.info("Collecting lexica:");
int total = 0;
for (String trainFile : trainFiles)
{
reader.open(UTInput.createBufferedFileReader(trainFile));
while ((tree = reader.next()) != null)
{
component.collect(tree);
if (++total%5000 == 0) LOG.info(".");
}
reader.close();
} LOG.info("\n");
LOG.info("Trainig:");
Object[] lexica = component.getLexica();
component = new EnglishOnlinePOSTagger(xmls, lexica);
for (String trainFile : trainFiles)
{
reader.open(UTInput.createBufferedFileReader(trainFile));
while ((tree = reader.next()) != null)
{
component.train(tree);
if (++total%5000 == 0) LOG.info(".");
}
reader.close();
}
IOnlineAlgorithm algorithm = new OnlineAdaGradHinge(0.01, 0.1);
List<DEPTree> devTrees = getTrees(reader, devFiles);
component.develop(LOG, algorithm, 5, devTrees);
LOG.info("Bootsrapping:");
for (String trainFile : trainFiles)
{
reader.open(UTInput.createBufferedFileReader(trainFile));
while ((tree = reader.next()) != null)
{
component.bootstrap(tree);
if (++total%5000 == 0) LOG.info(".");
}
reader.close();
}
component.develop(LOG, algorithm, 5, devTrees);
}