    InstanceList oldInstances = clustering.getInstances();
    Alphabet alph = oldInstances.getDataAlphabet();
    LabelAlphabet lalph = (LabelAlphabet) oldInstances.getTargetAlphabet();
    if (alph == null) alph = new Alphabet();
    if (lalph == null) lalph = new LabelAlphabet();
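    // Noop is a pass-through pipe: it leaves instances unchanged and serves only to
    // associate the data and label alphabets with the new InstanceList.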
    Pipe noop = new Noop(alph, lalph);
    InstanceList newInstances = new InstanceList(noop);
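    // Keep only instances whose cluster meets the minimum size threshold,
    // relabeling each surviving instance with its cluster id.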
    for (int j = 0; j < oldInstances.size(); j++) {
      int label = clustering.getLabel(j);
      Instance instance = oldInstances.get(j);
      if (clustering.size(label) >= minClusterSize.value)
        newInstances.add(noop.pipe(new Instance(instance.getData(),
            lalph.lookupLabel(Integer.valueOf(label)), instance.getName(), instance.getSource())));
    }
    clusterings.set(i, createSmallerClustering(newInstances));
  }
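  // Serialize the pruned clusterings if an output file was requested.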
  if (outputPrefixFile.value != null) {
    // Try-with-resources ensures the stream is closed even if writeObject throws.
    try (ObjectOutputStream oos =
             new ObjectOutputStream(new FileOutputStream(outputPrefixFile.value))) {
      oos.writeObject(clusterings);
    } catch (Exception e) {
      logger.warning("Exception writing clusterings to file " + outputPrefixFile.value + " " + e);
      e.printStackTrace();
    }
  }
}
// Split into training/testing by sampling whole clusters, so no cluster is divided between the two sets.
if (trainingProportion.value > 0) {
  if (clusterings.size() > 1)
    throw new IllegalArgumentException("Expect one clustering to do train/test split, not " + clusterings.size());
  Clustering clustering = clusterings.get(0);
  int targetTrainSize = (int) (trainingProportion.value * clustering.getNumInstances());
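  // Trove's TIntHashSet tracks which clusters have already been sampled, without boxing ints;
  // the fixed seed (123) below makes the train/test split reproducible across runs.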
  TIntHashSet clustersSampled = new TIntHashSet();
  Randoms random = new Randoms(123);
  LabelAlphabet lalph = new LabelAlphabet();
  InstanceList trainingInstances = new InstanceList(new Noop(null, lalph));
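  // Sample whole clusters without replacement until the training split reaches the target size;
  // since every instance belongs to some cluster, the loop terminates for any proportion <= 1.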
  while (trainingInstances.size() < targetTrainSize) {
    int cluster = random.nextInt(clustering.getNumClusters());
    if (!clustersSampled.contains(cluster)) {
      clustersSampled.add(cluster);
      InstanceList instances = clustering.getCluster(cluster);