/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* ClusterEvaluation.java
* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
*
*/
package weka.clusterers;
import weka.core.Drawable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;
import java.beans.BeanInfo;
import java.beans.Introspector;
import java.beans.MethodDescriptor;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.Serializable;
import java.lang.reflect.Method;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
/**
* Class for evaluating clustering models.<p/>
*
* Valid options are: <p/>
*
* -t name of the training file <br/>
* Specify the training file. <p/>
*
* -T name of the test file <br/>
* Specify the test file to apply clusterer to. <p/>
*
* -d name of file to save clustering model to <br/>
* Specify output file. <p/>
*
* -l name of file to load clustering model from <br/>
* Specifiy input file. <p/>
*
* -p attribute range <br/>
* Output predictions. Predictions are for the training file if only the
* training file is specified, otherwise they are for the test file. The range
* specifies attribute values to be output with the predictions.
* Use '-p 0' for none. <p/>
*
* -x num folds <br/>
* Set the number of folds for a cross validation of the training data.
* Cross validation can only be done for distribution clusterers and will
* be performed if the test file is missing. <p/>
*
* -s num <br/>
* Sets the seed for randomizing the data for cross-validation. <p/>
*
* -c class <br/>
* Set the class attribute. If set, then class based evaluation of clustering
* is performed. <p/>
*
* -g name of graph file <br/>
* Outputs the graph representation of the clusterer to the file. Only for
* clusterer that implemented the <code>weka.core.Drawable</code> interface.
* <p/>
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @version $Revision: 7756 $
* @see weka.core.Drawable
*/
public class ClusterEvaluation
implements Serializable, RevisionHandler {
/** for serialization */
static final long serialVersionUID = -830188327319128005L;
/** the clusterer */
private Clusterer m_Clusterer;
/** holds a string describing the results of clustering the training data */
private StringBuffer m_clusteringResults;
/** holds the number of clusters found by the clusterer */
private int m_numClusters;
/** holds the assigments of instances to clusters for a particular testing
dataset */
private double[] m_clusterAssignments;
/** holds the average log likelihood for a particular testing dataset
if the clusterer is a DensityBasedClusterer */
private double m_logL;
/** will hold the mapping of classes to clusters (for class based
evaluation) */
private int[] m_classToCluster = null;
/**
* set the clusterer
* @param clusterer the clusterer to use
*/
public void setClusterer(Clusterer clusterer) {
m_Clusterer = clusterer;
}
/**
* return the results of clustering.
* @return a string detailing the results of clustering a data set
*/
public String clusterResultsToString() {
return m_clusteringResults.toString();
}
/**
* Return the number of clusters found for the most recent call to
* evaluateClusterer
* @return the number of clusters found
*/
public int getNumClusters() {
return m_numClusters;
}
/**
* Return an array of cluster assignments corresponding to the most
* recent set of instances clustered.
* @return an array of cluster assignments
*/
public double[] getClusterAssignments() {
return m_clusterAssignments;
}
/**
* Return the array (ordered by cluster number) of minimum error class to
* cluster mappings
* @return an array of class to cluster mappings
*/
public int[] getClassesToClusters() {
return m_classToCluster;
}
/**
* Return the log likelihood corresponding to the most recent
* set of instances clustered.
*
* @return a <code>double</code> value
*/
public double getLogLikelihood() {
return m_logL;
}
/**
* Constructor. Sets defaults for each member variable. Default Clusterer
* is EM.
*/
public ClusterEvaluation () {
setClusterer(new SimpleKMeans());
m_clusteringResults = new StringBuffer();
m_clusterAssignments = null;
}
/**
* Evaluate the clusterer on a set of instances. Calculates clustering
* statistics and stores cluster assigments for the instances in
* m_clusterAssignments
*
* @param test the set of instances to cluster
* @throws Exception if something goes wrong
*/
public void evaluateClusterer(Instances test) throws Exception {
evaluateClusterer(test, "");
}
/**
* Evaluate the clusterer on a set of instances. Calculates clustering
* statistics and stores cluster assigments for the instances in
* m_clusterAssignments
*
* @param test the set of instances to cluster
* @param testFileName the name of the test file for incremental testing,
* if "" or null then not used
*
* @throws Exception if something goes wrong
*/
public void evaluateClusterer(Instances test, String testFileName)
throws Exception {
evaluateClusterer(test, testFileName, true);
}
/**
* Evaluate the clusterer on a set of instances. Calculates clustering
* statistics and stores cluster assigments for the instances in
* m_clusterAssignments
*
* @param test the set of instances to cluster
* @param testFileName the name of the test file for incremental testing,
* if "" or null then not used
* @param outputModel true if the clustering model is to be output as well
* as the stats
*
* @throws Exception if something goes wrong
*/
public void evaluateClusterer(Instances test, String testFileName,
boolean outputModel) throws Exception {
int i = 0;
int cnum;
double loglk = 0.0;
int cc = m_Clusterer.numberOfClusters();
m_numClusters = cc;
double[] instanceStats = new double[cc];
Instances testRaw = null;
boolean hasClass = (test.classIndex() >= 0);
int unclusteredInstances = 0;
Vector<Double> clusterAssignments = new Vector<Double>();
Filter filter = null;
DataSource source = null;
Instance inst;
if (testFileName == null)
testFileName = "";
// load data
if (testFileName.length() != 0)
source = new DataSource(testFileName);
else
source = new DataSource(test);
testRaw = source.getStructure(test.classIndex());
// If class is set then do class based evaluation as well
if (hasClass) {
if (testRaw.classAttribute().isNumeric())
throw new Exception("ClusterEvaluation: Class must be nominal!");
filter = new Remove();
((Remove) filter).setAttributeIndices("" + (testRaw.classIndex() + 1));
((Remove) filter).setInvertSelection(false);
filter.setInputFormat(testRaw);
}
i = 0;
while (source.hasMoreElements(testRaw)) {
// next instance
inst = source.nextElement(testRaw);
if (filter != null) {
filter.input(inst);
filter.batchFinished();
inst = filter.output();
}
cnum = -1;
try {
if (m_Clusterer instanceof DensityBasedClusterer) {
loglk += ((DensityBasedClusterer)m_Clusterer).
logDensityForInstance(inst);
cnum = m_Clusterer.clusterInstance(inst);
clusterAssignments.add((double) cnum);
}
else {
cnum = m_Clusterer.clusterInstance(inst);
clusterAssignments.add((double) cnum);
}
}
catch (Exception e) {
clusterAssignments.add(-1.0);
unclusteredInstances++;
}
if (cnum != -1) {
instanceStats[cnum]++;
}
}
double sum = Utils.sum(instanceStats);
loglk /= sum;
m_logL = loglk;
m_clusterAssignments = new double [clusterAssignments.size()];
for (i = 0; i < clusterAssignments.size(); i++) {
m_clusterAssignments[i] = clusterAssignments.get(i);
}
int numInstFieldWidth = (int)((Math.log(clusterAssignments.size())/Math.log(10))+1);
if (outputModel) {
m_clusteringResults.append(m_Clusterer.toString());
}
m_clusteringResults.append("Clustered Instances\n\n");
int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1);
for (i = 0; i < cc; i++) {
if (instanceStats[i] > 0)
m_clusteringResults.append(Utils.doubleToString((double)i,
clustFieldWidth, 0)
+ " "
+ Utils.doubleToString(instanceStats[i],
numInstFieldWidth, 0)
+ " ("
+ Utils.doubleToString((instanceStats[i] /
sum * 100.0)
, 3, 0) + "%)\n");
}
if (unclusteredInstances > 0)
m_clusteringResults.append("\nUnclustered instances : "
+unclusteredInstances);
if (m_Clusterer instanceof DensityBasedClusterer)
m_clusteringResults.append("\n\nLog likelihood: "
+ Utils.doubleToString(loglk, 1, 5)
+ "\n");
if (hasClass) {
evaluateClustersWithRespectToClass(test, testFileName);
}
}
/**
* Evaluates cluster assignments with respect to actual class labels.
* Assumes that m_Clusterer has been trained and tested on
* inst (minus the class).
*
* @param inst the instances (including class) to evaluate with respect to
* @param fileName the name of the test file for incremental testing,
* if "" or null then not used
* @throws Exception if something goes wrong
*/
private void evaluateClustersWithRespectToClass(Instances inst, String fileName)
throws Exception {
int numClasses = inst.classAttribute().numValues();
int[][] counts = new int [m_numClusters][numClasses];
int[] clusterTotals = new int[m_numClusters];
double[] best = new double[m_numClusters+1];
double[] current = new double[m_numClusters+1];
DataSource source = null;
Instances instances = null;
Instance instance = null;
int i;
int numInstances;
if (fileName == null)
fileName = "";
if (fileName.length() != 0) {
source = new DataSource(fileName);
}
else
source = new DataSource(inst);
instances = source.getStructure(inst.classIndex());
i = 0;
while (source.hasMoreElements(instances)) {
instance = source.nextElement(instances);
if (m_clusterAssignments[i] >= 0) {
counts[(int)m_clusterAssignments[i]][(int)instance.classValue()]++;
clusterTotals[(int)m_clusterAssignments[i]]++;
}
i++;
}
numInstances = i;
best[m_numClusters] = Double.MAX_VALUE;
mapClasses(m_numClusters, 0, counts, clusterTotals, current, best, 0);
m_clusteringResults.append("\n\nClass attribute: "
+inst.classAttribute().name()
+"\n");
m_clusteringResults.append("Classes to Clusters:\n");
String matrixString = toMatrixString(counts, clusterTotals, new Instances(inst, 0));
m_clusteringResults.append(matrixString).append("\n");
int Cwidth = 1 + (int)(Math.log(m_numClusters) / Math.log(10));
// add the minimum error assignment
for (i = 0; i < m_numClusters; i++) {
if (clusterTotals[i] > 0) {
m_clusteringResults.append("Cluster "
+Utils.doubleToString((double)i,Cwidth,0));
m_clusteringResults.append(" <-- ");
if (best[i] < 0) {
m_clusteringResults.append("No class\n");
} else {
m_clusteringResults.
append(inst.classAttribute().value((int)best[i])).append("\n");
}
}
}
m_clusteringResults.append("\nIncorrectly clustered instances :\t"
+best[m_numClusters]+"\t"
+(Utils.doubleToString((best[m_numClusters] /
numInstances *
100.0), 8, 4))
+" %\n");
// copy the class assignments
m_classToCluster = new int [m_numClusters];
for (i = 0; i < m_numClusters; i++) {
m_classToCluster[i] = (int)best[i];
}
}
/**
* Returns a "confusion" style matrix of classes to clusters assignments
* @param counts the counts of classes for each cluster
* @param clusterTotals total number of examples in each cluster
* @param inst the training instances (with class)
* @return the "confusion" style matrix as string
* @throws Exception if matrix can't be generated
*/
private String toMatrixString(int[][] counts, int[] clusterTotals,
Instances inst)
throws Exception {
StringBuffer ms = new StringBuffer();
int maxval = 0;
for (int i = 0; i < m_numClusters; i++) {
for (int j = 0; j < counts[i].length; j++) {
if (counts[i][j] > maxval) {
maxval = counts[i][j];
}
}
}
int Cwidth = 1 + Math.max((int)(Math.log(maxval) / Math.log(10)),
(int)(Math.log(m_numClusters) / Math.log(10)));
ms.append("\n");
for (int i = 0; i < m_numClusters; i++) {
if (clusterTotals[i] > 0) {
ms.append(" ").append(Utils.doubleToString((double)i, Cwidth, 0));
}
}
ms.append(" <-- assigned to cluster\n");
for (int i = 0; i< counts[0].length; i++) {
for (int j = 0; j < m_numClusters; j++) {
if (clusterTotals[j] > 0) {
ms.append(" ").append(Utils.doubleToString((double)counts[j][i],
Cwidth, 0));
}
}
ms.append(" | ").append(inst.classAttribute().value(i)).append("\n");
}
return ms.toString();
}
/**
* Finds the minimum error mapping of classes to clusters. Recursively
* considers all possible class to cluster assignments.
*
* @param numClusters the number of clusters
* @param lev the cluster being processed
* @param counts the counts of classes in clusters
* @param clusterTotals the total number of examples in each cluster
* @param current the current path through the class to cluster assignment
* tree
* @param best the best assignment path seen
* @param error accumulates the error for a particular path
*/
public static void mapClasses(int numClusters, int lev, int[][] counts, int[] clusterTotals,
double[] current, double[] best, int error) {
// leaf
if (lev == numClusters) {
if (error < best[numClusters]) {
best[numClusters] = error;
for (int i = 0; i < numClusters; i++) {
best[i] = current[i];
}
}
} else {
// empty cluster -- ignore
if (clusterTotals[lev] == 0) {
current[lev] = -1; // cluster ignored
mapClasses(numClusters, lev+1, counts, clusterTotals, current, best,
error);
} else {
// first try no class assignment to this cluster
current[lev] = -1; // cluster assigned no class (ie all errors)
mapClasses(numClusters, lev+1, counts, clusterTotals, current, best,
error+clusterTotals[lev]);
// now loop through the classes in this cluster
for (int i = 0; i < counts[0].length; i++) {
if (counts[lev][i] > 0) {
boolean ok = true;
// check to see if this class has already been assigned
for (int j = 0; j < lev; j++) {
if ((int)current[j] == i) {
ok = false;
break;
}
}
if (ok) {
current[lev] = i;
mapClasses(numClusters, lev+1, counts, clusterTotals, current, best,
(error + (clusterTotals[lev] - counts[lev][i])));
}
}
}
}
}
}
/**
* Evaluates a clusterer with the options given in an array of
* strings. It takes the string indicated by "-t" as training file, the
* string indicated by "-T" as test file.
* If the test file is missing, a stratified ten-fold
* cross-validation is performed (distribution clusterers only).
* Using "-x" you can change the number of
* folds to be used, and using "-s" the random seed.
* If the "-p" option is present it outputs the classification for
* each test instance. If you provide the name of an object file using
* "-l", a clusterer will be loaded from the given file. If you provide the
* name of an object file using "-d", the clusterer built from the
* training data will be saved to the given file.
*
* @param clusterer machine learning clusterer
* @param options the array of string containing the options
* @throws Exception if model could not be evaluated successfully
* @return a string describing the results
*/
public static String evaluateClusterer(Clusterer clusterer, String[] options)
throws Exception {
int seed = 1, folds = 10;
boolean doXval = false;
Instances train = null;
Random random;
String trainFileName, testFileName, seedString, foldsString;
String objectInputFileName, objectOutputFileName, attributeRangeString;
String graphFileName;
String[] savedOptions = null;
boolean printClusterAssignments = false;
Range attributesToOutput = null;
StringBuffer text = new StringBuffer();
int theClass = -1; // class based evaluation of clustering
boolean updateable = (clusterer instanceof UpdateableClusterer);
DataSource source = null;
Instance inst;
if (Utils.getFlag('h', options) || Utils.getFlag("help", options)) {
// global info requested as well?
boolean globalInfo = Utils.getFlag("synopsis", options) ||
Utils.getFlag("info", options);
throw new Exception("Help requested."
+ makeOptionString(clusterer, globalInfo));
}
try {
// Get basic options (options the same for all clusterers
//printClusterAssignments = Utils.getFlag('p', options);
objectInputFileName = Utils.getOption('l', options);
objectOutputFileName = Utils.getOption('d', options);
trainFileName = Utils.getOption('t', options);
testFileName = Utils.getOption('T', options);
graphFileName = Utils.getOption('g', options);
// Check -p option
try {
attributeRangeString = Utils.getOption('p', options);
}
catch (Exception e) {
throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. " +
"It now expects a parameter specifying a range of attributes " +
"to list with the predictions. Use '-p 0' for none.");
}
if (attributeRangeString.length() != 0) {
printClusterAssignments = true;
if (!attributeRangeString.equals("0"))
attributesToOutput = new Range(attributeRangeString);
}
if (trainFileName.length() == 0) {
if (objectInputFileName.length() == 0) {
throw new Exception("No training file and no object "
+ "input file given.");
}
if (testFileName.length() == 0) {
throw new Exception("No training file and no test file given.");
}
}
else {
if ((objectInputFileName.length() != 0)
&& (printClusterAssignments == false)) {
throw new Exception("Can't use both train and model file "
+ "unless -p specified.");
}
}
seedString = Utils.getOption('s', options);
if (seedString.length() != 0) {
seed = Integer.parseInt(seedString);
}
foldsString = Utils.getOption('x', options);
if (foldsString.length() != 0) {
folds = Integer.parseInt(foldsString);
doXval = true;
}
}
catch (Exception e) {
throw new Exception('\n' + e.getMessage()
+ makeOptionString(clusterer, false));
}
try {
if (trainFileName.length() != 0) {
source = new DataSource(trainFileName);
train = source.getStructure();
String classString = Utils.getOption('c',options);
if (classString.length() != 0) {
if (classString.compareTo("last") == 0)
theClass = train.numAttributes();
else if (classString.compareTo("first") == 0)
theClass = 1;
else
theClass = Integer.parseInt(classString);
if (theClass != -1) {
if (doXval || testFileName.length() != 0)
throw new Exception("Can only do class based evaluation on the "
+"training data");
if (objectInputFileName.length() != 0)
throw new Exception("Can't load a clusterer and do class based "
+"evaluation");
if (objectOutputFileName.length() != 0)
throw new Exception(
"Can't do class based evaluation and save clusterer");
}
}
else {
// if the dataset defines a class attribute, use it
if (train.classIndex() != -1) {
theClass = train.classIndex() + 1;
System.err.println(
"Note: using class attribute from dataset, i.e., attribute #"
+ theClass);
}
}
if (theClass != -1) {
if (theClass < 1 || theClass > train.numAttributes())
throw new Exception("Class is out of range!");
if (!train.attribute(theClass - 1).isNominal())
throw new Exception("Class must be nominal!");
train.setClassIndex(theClass - 1);
}
}
}
catch (Exception e) {
throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
}
// Save options
if (options != null) {
savedOptions = new String[options.length];
System.arraycopy(options, 0, savedOptions, 0, options.length);
}
if (objectInputFileName.length() != 0)
Utils.checkForRemainingOptions(options);
// Set options for clusterer
if (clusterer instanceof OptionHandler)
((OptionHandler)clusterer).setOptions(options);
Utils.checkForRemainingOptions(options);
Instances trainHeader = train;
if (objectInputFileName.length() != 0) {
// Load the clusterer from file
// clusterer = (Clusterer) SerializationHelper.read(objectInputFileName);
java.io.ObjectInputStream ois =
new java.io.ObjectInputStream(
new java.io.BufferedInputStream(
new java.io.FileInputStream(objectInputFileName)));
clusterer = (Clusterer) ois.readObject();
// try and get the training header
try {
trainHeader = (Instances) ois.readObject();
} catch (Exception ex) {
// don't moan if we cant
}
}
else {
// Build the clusterer if no object file provided
if (theClass == -1) {
if (updateable) {
clusterer.buildClusterer(source.getStructure());
while (source.hasMoreElements(train)) {
inst = source.nextElement(train);
((UpdateableClusterer) clusterer).updateClusterer(inst);
}
((UpdateableClusterer) clusterer).updateFinished();
}
else {
clusterer.buildClusterer(source.getDataSet());
}
}
else {
Remove removeClass = new Remove();
removeClass.setAttributeIndices("" + theClass);
removeClass.setInvertSelection(false);
removeClass.setInputFormat(train);
if (updateable) {
Instances clusterTrain = Filter.useFilter(train, removeClass);
clusterer.buildClusterer(clusterTrain);
trainHeader = clusterTrain;
while (source.hasMoreElements(train)) {
inst = source.nextElement(train);
removeClass.input(inst);
removeClass.batchFinished();
Instance clusterTrainInst = removeClass.output();
((UpdateableClusterer) clusterer).updateClusterer(clusterTrainInst);
}
((UpdateableClusterer) clusterer).updateFinished();
}
else {
Instances clusterTrain = Filter.useFilter(source.getDataSet(), removeClass);
clusterer.buildClusterer(clusterTrain);
trainHeader = clusterTrain;
}
ClusterEvaluation ce = new ClusterEvaluation();
ce.setClusterer(clusterer);
ce.evaluateClusterer(train, trainFileName);
return "\n\n=== Clustering stats for training data ===\n\n" +
ce.clusterResultsToString();
}
}
/* Output cluster predictions only (for the test data if specified,
otherwise for the training data */
if (printClusterAssignments) {
return printClusterings(clusterer, trainFileName, testFileName, attributesToOutput);
}
text.append(clusterer.toString());
text.append("\n\n=== Clustering stats for training data ===\n\n"
+ printClusterStats(clusterer, trainFileName));
if (testFileName.length() != 0) {
// check header compatibility
DataSource test = new DataSource(testFileName);
Instances testStructure = test.getStructure();
if (!trainHeader.equalHeaders(testStructure)) {
throw new Exception("Training and testing data are not compatible\n" + trainHeader.equalHeadersMsg(testStructure));
}
text.append("\n\n=== Clustering stats for testing data ===\n\n"
+ printClusterStats(clusterer, testFileName));
}
if ((clusterer instanceof DensityBasedClusterer) &&
(doXval == true) &&
(testFileName.length() == 0) &&
(objectInputFileName.length() == 0)) {
// cross validate the log likelihood on the training data
random = new Random(seed);
random.setSeed(seed);
train = source.getDataSet();
train.randomize(random);
text.append(
crossValidateModel(
clusterer.getClass().getName(), train, folds, savedOptions, random));
}
// Save the clusterer if an object output file is provided
if (objectOutputFileName.length() != 0) {
//SerializationHelper.write(objectOutputFileName, clusterer);
saveClusterer(objectOutputFileName, clusterer, trainHeader);
}
// If classifier is drawable output string describing graph
if ((clusterer instanceof Drawable) && (graphFileName.length() != 0)) {
BufferedWriter writer = new BufferedWriter(new FileWriter(graphFileName));
writer.write(((Drawable) clusterer).graph());
writer.newLine();
writer.flush();
writer.close();
}
return text.toString();
}
private static void saveClusterer(String fileName,
Clusterer clusterer,
Instances header) throws Exception {
java.io.ObjectOutputStream oos =
new java.io.ObjectOutputStream(
new java.io.BufferedOutputStream(
new java.io.FileOutputStream(fileName)));
oos.writeObject(clusterer);
if (header != null) {
oos.writeObject(header);
}
oos.flush();
oos.close();
}
/**
* Perform a cross-validation for DensityBasedClusterer on a set of instances.
*
* @param clusterer the clusterer to use
* @param data the training data
* @param numFolds number of folds of cross validation to perform
* @param random random number seed for cross-validation
* @return the cross-validated log-likelihood
* @throws Exception if an error occurs
*/
public static double crossValidateModel(DensityBasedClusterer clusterer,
Instances data,
int numFolds,
Random random) throws Exception {
Instances train, test;
double foldAv = 0;;
data = new Instances(data);
data.randomize(random);
// double sumOW = 0;
for (int i = 0; i < numFolds; i++) {
// Build and test clusterer
train = data.trainCV(numFolds, i, random);
clusterer.buildClusterer(train);
test = data.testCV(numFolds, i);
for (int j = 0; j < test.numInstances(); j++) {
try {
foldAv += ((DensityBasedClusterer)clusterer).
logDensityForInstance(test.instance(j));
// sumOW += test.instance(j).weight();
// double temp = Utils.sum(tempDist);
} catch (Exception ex) {
// unclustered instances
}
}
}
// return foldAv / sumOW;
return foldAv / data.numInstances();
}
/**
* Performs a cross-validation
* for a DensityBasedClusterer clusterer on a set of instances.
*
* @param clustererString a string naming the class of the clusterer
* @param data the data on which the cross-validation is to be
* performed
* @param numFolds the number of folds for the cross-validation
* @param options the options to the clusterer
* @param random a random number generator
* @return a string containing the cross validated log likelihood
* @throws Exception if a clusterer could not be generated
*/
public static String crossValidateModel (String clustererString,
Instances data,
int numFolds,
String[] options,
Random random)
throws Exception {
Clusterer clusterer = null;
String[] savedOptions = null;
double CvAv = 0.0;
StringBuffer CvString = new StringBuffer();
if (options != null) {
savedOptions = new String[options.length];
}
data = new Instances(data);
// create clusterer
try {
clusterer = (Clusterer)Class.forName(clustererString).newInstance();
}
catch (Exception e) {
throw new Exception("Can't find class with name "
+ clustererString + '.');
}
if (!(clusterer instanceof DensityBasedClusterer)) {
throw new Exception(clustererString
+ " must be a distrinbution "
+ "clusterer.");
}
// Save options
if (options != null) {
System.arraycopy(options, 0, savedOptions, 0, options.length);
}
// Parse options
if (clusterer instanceof OptionHandler) {
try {
((OptionHandler)clusterer).setOptions(savedOptions);
Utils.checkForRemainingOptions(savedOptions);
}
catch (Exception e) {
throw new Exception("Can't parse given options in "
+ "cross-validation!");
}
}
CvAv = crossValidateModel((DensityBasedClusterer)clusterer, data, numFolds, random);
CvString.append("\n" + numFolds
+ " fold CV Log Likelihood: "
+ Utils.doubleToString(CvAv, 6, 4)
+ "\n");
return CvString.toString();
}
// ===============
// Private methods
// ===============
/**
* Print the cluster statistics for either the training
* or the testing data.
*
* @param clusterer the clusterer to use for generating statistics.
* @param fileName the file to load
* @return a string containing cluster statistics.
* @throws Exception if statistics can't be generated.
*/
private static String printClusterStats (Clusterer clusterer,
String fileName)
throws Exception {
StringBuffer text = new StringBuffer();
int i = 0;
int cnum;
double loglk = 0.0;
int cc = clusterer.numberOfClusters();
double[] instanceStats = new double[cc];
int unclusteredInstances = 0;
if (fileName.length() != 0) {
DataSource source = new DataSource(fileName);
Instances structure = source.getStructure();
Instance inst;
while (source.hasMoreElements(structure)) {
inst = source.nextElement(structure);
try {
cnum = clusterer.clusterInstance(inst);
if (clusterer instanceof DensityBasedClusterer) {
loglk += ((DensityBasedClusterer)clusterer).
logDensityForInstance(inst);
// temp = Utils.sum(dist);
}
instanceStats[cnum]++;
}
catch (Exception e) {
unclusteredInstances++;
}
i++;
}
/*
// count the actual number of used clusters
int count = 0;
for (i = 0; i < cc; i++) {
if (instanceStats[i] > 0) {
count++;
}
}
if (count > 0) {
double[] tempStats = new double [count];
count=0;
for (i=0;i<cc;i++) {
if (instanceStats[i] > 0) {
tempStats[count++] = instanceStats[i];
}
}
instanceStats = tempStats;
cc = instanceStats.length;
} */
int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1);
int numInstFieldWidth = (int)((Math.log(i)/Math.log(10))+1);
double sum = Utils.sum(instanceStats);
loglk /= sum;
text.append("Clustered Instances\n");
for (i = 0; i < cc; i++) {
if (instanceStats[i] > 0) {
text.append(Utils.doubleToString((double)i,
clustFieldWidth, 0)
+ " "
+ Utils.doubleToString(instanceStats[i],
numInstFieldWidth, 0)
+ " ("
+ Utils.doubleToString((instanceStats[i]/sum*100.0)
, 3, 0) + "%)\n");
}
}
if (unclusteredInstances > 0) {
text.append("\nUnclustered Instances : "+unclusteredInstances);
}
if (clusterer instanceof DensityBasedClusterer) {
text.append("\n\nLog likelihood: "
+ Utils.doubleToString(loglk, 1, 5)
+ "\n");
}
}
return text.toString();
}
/**
* Print the cluster assignments for either the training
* or the testing data.
*
* @param clusterer the clusterer to use for cluster assignments
* @param trainFileName the train file
* @param testFileName an optional test file
* @param attributesToOutput the attributes to print
* @return a string containing the instance indexes and cluster assigns.
* @throws Exception if cluster assignments can't be printed
*/
private static String printClusterings (Clusterer clusterer, String trainFileName,
String testFileName, Range attributesToOutput)
throws Exception {
StringBuffer text = new StringBuffer();
int i = 0;
int cnum;
DataSource source = null;
Instance inst;
Instances structure;
if (testFileName.length() != 0)
source = new DataSource(testFileName);
else
source = new DataSource(trainFileName);
structure = source.getStructure();
while (source.hasMoreElements(structure)) {
inst = source.nextElement(structure);
try {
cnum = clusterer.clusterInstance(inst);
text.append(i + " " + cnum + " "
+ attributeValuesString(inst, attributesToOutput) + "\n");
}
catch (Exception e) {
/* throw new Exception('\n' + "Unable to cluster instance\n"
+ e.getMessage()); */
text.append(i + " Unclustered "
+ attributeValuesString(inst, attributesToOutput) + "\n");
}
i++;
}
return text.toString();
}
/**
* Builds a string listing the attribute values in a specified range of indices,
* separated by commas and enclosed in brackets.
*
* @param instance the instance to print the values from
* @param attRange the range of the attributes to list
* @return a string listing values of the attributes in the range
*/
private static String attributeValuesString(Instance instance, Range attRange) {
StringBuffer text = new StringBuffer();
if (attRange != null) {
boolean firstOutput = true;
attRange.setUpper(instance.numAttributes() - 1);
for (int i=0; i<instance.numAttributes(); i++)
if (attRange.isInRange(i)) {
if (firstOutput) text.append("(");
else text.append(",");
text.append(instance.toString(i));
firstOutput = false;
}
if (!firstOutput) text.append(")");
}
return text.toString();
}
/**
* Make up the help string giving all the command line options
*
* @param clusterer the clusterer to include options for
* @return a string detailing the valid command line options
*/
private static String makeOptionString (Clusterer clusterer,
boolean globalInfo) {
StringBuffer optionsText = new StringBuffer("");
// General options
optionsText.append("\n\nGeneral options:\n\n");
optionsText.append("-h or -help\n");
optionsText.append("\tOutput help information.\n");
optionsText.append("-synopsis or -info\n");
optionsText.append("\tOutput synopsis for clusterer (use in conjunction "
+ " with -h)\n");
optionsText.append("-t <name of training file>\n");
optionsText.append("\tSets training file.\n");
optionsText.append("-T <name of test file>\n");
optionsText.append("\tSets test file.\n");
optionsText.append("-l <name of input file>\n");
optionsText.append("\tSets model input file.\n");
optionsText.append("-d <name of output file>\n");
optionsText.append("\tSets model output file.\n");
optionsText.append("-p <attribute range>\n");
optionsText.append("\tOutput predictions. Predictions are for "
+ "training file"
+ "\n\tif only training file is specified,"
+ "\n\totherwise predictions are for the test file."
+ "\n\tThe range specifies attribute values to be output"
+ "\n\twith the predictions. Use '-p 0' for none.\n");
optionsText.append("-x <number of folds>\n");
optionsText.append("\tOnly Distribution Clusterers can be cross validated.\n");
optionsText.append("-s <random number seed>\n");
optionsText.append("\tSets the seed for randomizing the data in cross-validation\n");
optionsText.append("-c <class index>\n");
optionsText.append("\tSet class attribute. If supplied, class is ignored");
optionsText.append("\n\tduring clustering but is used in a classes to");
optionsText.append("\n\tclusters evaluation.\n");
if (clusterer instanceof Drawable) {
optionsText.append("-g <name of graph file>\n");
optionsText.append("\tOutputs the graph representation of the clusterer to the file.\n");
}
// Get scheme-specific options
if (clusterer instanceof OptionHandler) {
optionsText.append("\nOptions specific to "
+ clusterer.getClass().getName() + ":\n\n");
Enumeration enu = ((OptionHandler)clusterer).listOptions();
while (enu.hasMoreElements()) {
Option option = (Option)enu.nextElement();
optionsText.append(option.synopsis() + '\n');
optionsText.append(option.description() + "\n");
}
}
// Get global information (if available)
if (globalInfo) {
try {
String gi = getGlobalInfo(clusterer);
optionsText.append(gi);
} catch (Exception ex) {
// quietly ignore
}
}
return optionsText.toString();
}
/**
* Return the global info (if it exists) for the supplied clusterer
*
* @param clusterer the clusterer to get the global info for
* @return the global info (synopsis) for the clusterer
* @throws Exception if there is a problem reflecting on the clusterer
*/
protected static String getGlobalInfo(Clusterer clusterer) throws Exception {
BeanInfo bi = Introspector.getBeanInfo(clusterer.getClass());
MethodDescriptor[] methods;
methods = bi.getMethodDescriptors();
Object[] args = {};
String result = "\nSynopsis for " + clusterer.getClass().getName()
+ ":\n\n";
for (int i = 0; i < methods.length; i++) {
String name = methods[i].getDisplayName();
Method meth = methods[i].getMethod();
if (name.equals("globalInfo")) {
String globalInfo = (String)(meth.invoke(clusterer, args));
result += globalInfo;
break;
}
}
return result;
}
/**
* Tests whether the current evaluation object is equal to another
* evaluation object
*
* @param obj the object to compare against
* @return true if the two objects are equal
*/
public boolean equals(Object obj) {
if ((obj == null) || !(obj.getClass().equals(this.getClass())))
return false;
ClusterEvaluation cmp = (ClusterEvaluation) obj;
if ((m_classToCluster != null) != (cmp.m_classToCluster != null)) return false;
if (m_classToCluster != null) {
for (int i = 0; i < m_classToCluster.length; i++) {
if (m_classToCluster[i] != cmp.m_classToCluster[i])
return false;
}
}
if ((m_clusterAssignments != null) != (cmp.m_clusterAssignments != null)) return false;
if (m_clusterAssignments != null) {
for (int i = 0; i < m_clusterAssignments.length; i++) {
if (m_clusterAssignments[i] != cmp.m_clusterAssignments[i])
return false;
}
}
if (Double.isNaN(m_logL) != Double.isNaN(cmp.m_logL)) return false;
if (!Double.isNaN(m_logL)) {
if (m_logL != cmp.m_logL) return false;
}
if (m_numClusters != cmp.m_numClusters) return false;
// TODO: better comparison? via members?
String clusteringResults1 = m_clusteringResults.toString().replaceAll("Elapsed time.*", "");
String clusteringResults2 = cmp.m_clusteringResults.toString().replaceAll("Elapsed time.*", "");
if (!clusteringResults1.equals(clusteringResults2)) return false;
return true;
}
/**
* Returns the revision string.
*
* @return the revision
*/
public String getRevision() {
return RevisionUtils.extract("$Revision: 7756 $");
}
/**
* Main method for testing this class.
*
* @param args the options
*/
public static void main (String[] args) {
try {
if (args.length == 0) {
throw new Exception("The first argument must be the name of a "
+ "clusterer");
}
String ClustererString = args[0];
args[0] = "";
Clusterer newClusterer = AbstractClusterer.forName(ClustererString, null);
System.out.println(evaluateClusterer(newClusterer, args));
}
catch (Exception e) {
System.out.println(e.getMessage());
}
}
}