}
} else {
throw new IllegalArgumentException("Unrecognized dataSource type: " + dataSource);
}
ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
classifier.initialize();
String defaultCat = "unknown";
if (cmdLine.hasOption(defaultCatOpt)) {
defaultCat = (String) cmdLine.getValue(defaultCatOpt);
}
File docPath = new File((String) cmdLine.getValue(classifyOpt));
String encoding = "UTF-8";
if (cmdLine.hasOption(encodingOpt)) {
encoding = (String) cmdLine.getValue(encodingOpt);
}
Analyzer analyzer = null;
if (cmdLine.hasOption(analyzerOpt)) {
String className = (String) cmdLine.getValue(analyzerOpt);
analyzer = Class.forName(className).asSubclass(Analyzer.class).newInstance();
}
if (analyzer == null) {
analyzer = new StandardAnalyzer(Version.LUCENE_30);
}
log.info("Converting input document to proper format");
String[] document = BayesFileFormatter.readerToDocument(analyzer, new InputStreamReader(
new FileInputStream(docPath), Charset.forName(encoding)));
StringBuilder line = new StringBuilder();
for (String token : document) {
line.append(token).append(' ');
}
List<String> doc = new NGrams(line.toString(), gramSize).generateNGramsWithoutLabel();
log.info("Done converting");
log.info("Classifying document: {}", docPath);
ClassifierResult category = classifier.classifyDocument(doc.toArray(new String[doc.size()]), defaultCat);
log.info("Category for {} is {}", docPath, category);
}