// Parse the command-line arguments into the CommandOption fields read below.
CommandOption.process(Clusterings2Clusterer.class, args);
// TRAIN
// Fixed seed so that pair sampling and clustering are repeatable between runs.
Randoms random = new Randoms(123);
Clusterer clusterer = null;
if (!loadClusterer.value.exists()) {
    // No serialized clusterer on disk: build one from the training clusterings.
    Clusterings training = readClusterings(trainingFile.value);
    // The field alphabet is taken from the first record of the first training
    // clustering; the match-field option strings are resolved against it.
    Alphabet fieldAlphabet = ((Record) training.get(0).getInstances()
            .get(0).getData()).fieldAlphabet();
    Pipe pipe = new ClusteringPipe(string2ints(exactMatchFields.value, fieldAlphabet),
            string2ints(approxMatchFields.value, fieldAlphabet),
            string2ints(substringMatchFields.value, fieldAlphabet));
    InstanceList trainingInstances = new InstanceList(pipe);
    // Sample instance pairs from every training clustering and pipe each
    // sampled pair into a single pairwise training list.
    for (int i = 0; i < training.size(); i++) {
        PairSampleIterator iterator = new PairSampleIterator(training
                .get(i), random, 0.5, training.get(i).getNumInstances());
        while (iterator.hasNext()) {
            Instance inst = iterator.next();
            trainingInstances.add(pipe.pipe(inst));
        }
    }
    logger.info("generated " + trainingInstances.size()
            + " training instances");
    // Train the pairwise classifier and report diagnostics on the training set.
    Classifier classifier = new MaxEntTrainer().train(trainingInstances);
    logger.info("InfoGain:\n");
    new InfoGain(trainingInstances).printByRank(System.out);
    logger.info("pairwise training accuracy="
            + new Trial(classifier, trainingInstances).getAccuracy());
    // Wrap the classifier as a neighbor evaluator keyed on the "YES" label and
    // hand it to the agglomerative clusterer, which reuses the pipe of the
    // first training clustering's instances.
    NeighborEvaluator neval = new PairwiseEvaluator(classifier, "YES",
            new PairwiseEvaluator.Average(), true);
    clusterer = new GreedyAgglomerativeByDensity(
            training.get(0).getInstances().getPipe(), neval, 0.5, false,
            random);
    // Drop the references to the training data (presumably so it can be
    // garbage-collected before the test phase).
    training = null;
    trainingInstances = null;
} else {
    // A serialized clusterer exists: load it instead of training.
    // NOTE(review): Java native deserialization executes code from the stream;
    // only point this option at trusted files.
    FileInputStream clustererFile = new FileInputStream(loadClusterer.value);
    try {
        ObjectInputStream ois = new ObjectInputStream(clustererFile);
        clusterer = (Clusterer) ois.readObject();
    } finally {
        // Always release the file handle, even when the stream header is
        // invalid or readObject() throws.
        clustererFile.close();
    }
}
// TEST
// Read the reference clusterings that the clusterer's output is compared to.
Clusterings testing = readClusterings(testingFile.value);
ClusteringEvaluator evaluator = (ClusteringEvaluator) clusteringEvaluatorOption.value;
if (evaluator == null) {
    // No evaluator was supplied through the option: wrap these four
    // evaluators in a single ClusteringEvaluators instance.
    ClusteringEvaluator[] defaultEvaluators = {
        new BCubedEvaluator(),
        new PairF1Evaluator(),
        new MUCEvaluator(),
        new AccuracyEvaluator()
    };
    evaluator = new ClusteringEvaluators(defaultEvaluators);
}
ArrayList<Clustering> predictions = new ArrayList<Clustering>();
// Re-cluster the instances of each reference clustering, keep the prediction,
// and log the per-clustering evaluation.
for (int idx = 0; idx < testing.size(); idx++) {
    Clustering truth = testing.get(idx);
    Clustering guess = clusterer.cluster(truth.getInstances());
    predictions.add(guess);
    logger.info(evaluator.evaluate(truth, guess));
}
// Log the evaluator's totals across all test clusterings.
logger.info(evaluator.evaluateTotals());