logger.info("Training new model from " + inFile.getAbsolutePath());
logger.info("Using " + numEosc + " end of sentence characters.");
Charset charset = Charset.forName("UTF-8");
SentenceModel mod = null;
try(FileInputStream inStream = new FileInputStream(inFile)){
ObjectStream<String> lineStream = new PlainTextByLineStream(inStream, charset);
ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream);
// Training Parameters
TrainingParameters mlParams = new TrainingParameters();
mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iters));
mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
// Abbreviations dictionary
// TODO: Actually import a Dictionary of abbreviations
Dictionary dict = new Dictionary();
try {
mod = SentenceDetectorME.train("en", sampleStream, true, dict, mlParams);
} finally {
sampleStream.close();
}
}
try(FileOutputStream outStream = new FileOutputStream(outFile)){
logger.info("Saving the model as: " + outFile.getAbsolutePath());
mod.serialize(outStream);
}
}