log.info("Testing Complementary Bayes Classifier");
model = new CBayesModel();
} else {
throw new IllegalArgumentException("Unrecognized classifier type: " + classifierType);
}
Classifier classifier = new BayesClassifier();
SequenceFileModelReader.loadModel(model, fs, modelPaths, conf);
log.info("Done loading model: # labels: {}", model.getLabels().size());
log.info("Done generating Model");
String defaultCat = "unknown";
if (cmdLine.hasOption(defaultCatOpt)) {
defaultCat = (String) cmdLine.getValue(defaultCatOpt);
}
File docPath = new File((String) cmdLine.getValue(classifyOpt));
String encoding = "UTF-8";
if (cmdLine.hasOption(encodingOpt)) {
encoding = (String) cmdLine.getValue(encodingOpt);
}
Analyzer analyzer = null;
if (cmdLine.hasOption(analyzerOpt)) {
String className = (String) cmdLine.getValue(analyzerOpt);
analyzer = Class.forName(className).asSubclass(Analyzer.class).newInstance();
}
if (analyzer == null) {
analyzer = new StandardAnalyzer();
}
int gramSize = 1;
if (cmdLine.hasOption(gramSizeOpt)) {
gramSize = Integer.parseInt((String) cmdLine
.getValue(gramSizeOpt));
}
log.info("Converting input document to proper format");
String[] document = BayesFileFormatter.readerToDocument(analyzer, new InputStreamReader(new FileInputStream(docPath), Charset.forName(encoding)));
StringBuilder line = new StringBuilder();
for(String token : document)
{
line.append(token).append(' ');
}
List<String> doc = Model.generateNGramsWithoutLabel(line.toString(), gramSize) ;
log.info("Done converting");
log.info("Classifying document: {}", docPath);
ClassifierResult category = classifier.classify(model, doc.toArray(new String[doc.size()]), defaultCat);
log.info("Category for {} is {}", docPath, category);
}