if (trainingProportion.value > 0) {
if (clusterings.size() > 1)
throw new IllegalArgumentException("Expect one clustering to do train/test split, not " + clusterings.size());
Clustering clustering = clusterings.get(0);
int targetTrainSize = (int)(trainingProportion.value * clustering.getNumInstances());
TIntHashSet clustersSampled = new TIntHashSet();
Randoms random = new Randoms(123);
LabelAlphabet lalph = new LabelAlphabet();
InstanceList trainingInstances = new InstanceList(new Noop(null, lalph));
while (trainingInstances.size() < targetTrainSize) {
int cluster = random.nextInt(clustering.getNumClusters());
if (!clustersSampled.contains(cluster)) {
clustersSampled.add(cluster);
InstanceList instances = clustering.getCluster(cluster);
for (int i = 0; i < instances.size(); i++) {
Instance inst = instances.get(i);
trainingInstances.add(new Instance(inst.getData(), lalph.lookupLabel(new Integer(cluster)), inst.getName(), inst.getSource()));
}
}
}
trainingInstances.shuffle(random);
Clustering trainingClustering = createSmallerClustering(trainingInstances);
InstanceList testingInstances = new InstanceList(null, lalph);
for (int i = 0; i < clustering.getNumClusters(); i++) {
if (!clustersSampled.contains(i)) {
InstanceList instances = clustering.getCluster(i);
for (int j = 0; j < instances.size(); j++) {
Instance inst = instances.get(j);
testingInstances.add(new Instance(inst.getData(), lalph.lookupLabel(new Integer(i)), inst.getName(), inst.getSource()));
}