// Build one Value Distance Metric (VDM) matrix per nominal or string feature.
// Entry [v1][v2] is the sum, over all class values, of
// |count(v1, class) / count(v1) - count(v2, class) / count(v2)|,
// with the counts taken over the instances of the input format.
Map vdmMap = new HashMap();
Enumeration attrEnum = getInputFormat().enumerateAttributes();
while (attrEnum.hasMoreElements()) {
  Attribute feature = (Attribute) attrEnum.nextElement();
  // the class attribute is not a feature; numeric features need no VDM matrix
  boolean isClassAttribute = feature.equals(getInputFormat().classAttribute());
  if (isClassAttribute || !(feature.isNominal() || feature.isString())) {
    continue;
  }

  int numFeatureValues = feature.numValues();
  double[][] vdm = new double[numFeatureValues][numFeatureValues];
  vdmMap.put(feature, vdm);

  // tally how often each feature value occurs, overall and per class value
  int[] totalsByValue = new int[numFeatureValues];
  int[][] totalsByClassAndValue =
      new int[getInputFormat().classAttribute().numValues()][numFeatureValues];
  instanceEnum = getInputFormat().enumerateInstances();
  while (instanceEnum.hasMoreElements()) {
    Instance counted = (Instance) instanceEnum.nextElement();
    int featureValueIndex = (int) counted.value(feature);
    int classValueIndex = (int) counted.classValue();
    totalsByValue[featureValueIndex]++;
    totalsByClassAndValue[classValueIndex][featureValueIndex]++;
  }

  // fill the matrix from the per-class relative frequencies of each value pair
  for (int first = 0; first < numFeatureValues; first++) {
    for (int second = 0; second < numFeatureValues; second++) {
      double sum = 0;
      for (int cls = 0; cls < getInputFormat().numClasses(); cls++) {
        double firstShare =
            (double) totalsByClassAndValue[cls][first] / (double) totalsByValue[first];
        double secondShare =
            (double) totalsByClassAndValue[cls][second] / (double) totalsByValue[second];
        sum += Math.abs(firstShare - secondShare);
      }
      vdm[first][second] = sum;
    }
  }
}
// single random source shared by every randomized step that follows
Random rand = new Random(getRandomSeed());

// When the percentage is not a whole multiple of 100, the fractional part
// selects a random subset of sample indices; that subset is consumed later
// through extraIndexSet.
double percentageRemainder = (getPercentage() / 100) - Math.floor(getPercentage() / 100.0);
int extraIndicesCount = (int) (percentageRemainder * sample.numInstances());
List extraIndices = new LinkedList();
if (extraIndicesCount >= 1) {
  // start from every index of the sample, then shuffle so that the prefix
  // kept below is a random selection
  for (int index = 0; index < sample.numInstances(); index++) {
    extraIndices.add(Integer.valueOf(index));
  }
  Collections.shuffle(extraIndices, rand);
}
// keep only the first extraIndicesCount entries (none when the list is empty)
extraIndices = extraIndices.subList(0, extraIndicesCount);
Set extraIndexSet = new HashSet(extraIndices);
// the main loop to handle computing nearest neighbors and generating SMOTE
// examples from each instance in the original minority class data
Instance[] nnArray = new Instance[nearestNeighbors];
for (int i = 0; i < sample.numInstances(); i++) {
Instance instanceI = sample.instance(i);
// collect a {distance, instance} entry for every other instance in the sample
List distanceToInstance = new LinkedList();
for (int j = 0; j < sample.numInstances(); j++) {
  Instance instanceJ = sample.instance(j);
  if (i == j) {
    // an instance is never its own neighbor
    continue;
  }

  double accumulated = 0;
  attrEnum = getInputFormat().enumerateAttributes();
  while (attrEnum.hasMoreElements()) {
    Attribute feature = (Attribute) attrEnum.nextElement();
    if (feature.equals(getInputFormat().classAttribute())) {
      continue;
    }
    double valueOfI = instanceI.value(feature);
    double valueOfJ = instanceJ.value(feature);
    if (feature.isNumeric()) {
      // numeric features contribute their squared difference
      accumulated += Math.pow(valueOfI - valueOfJ, 2);
    } else {
      // all other features contribute the precomputed VDM entry for the value pair
      double[][] featureVdm = (double[][]) vdmMap.get(feature);
      accumulated += featureVdm[(int) valueOfI][(int) valueOfJ];
    }
  }

  double distance = Math.pow(accumulated, .5);
  distanceToInstance.add(new Object[] {Double.valueOf(distance), instanceJ});
}
// sort the neighbors by ascending distance
Collections.sort(distanceToInstance, new Comparator() {
  /**
   * Orders two {distance, instance} entries by their distance component.
   *
   * @param o1 first entry, an Object[] whose element 0 is a Double distance
   * @param o2 second entry, same layout as o1
   * @return a negative, zero or positive value as the first distance is
   *         less than, equal to or greater than the second
   */
  public int compare(Object o1, Object o2) {
    double distance1 = (Double) ((Object[]) o1)[0];
    double distance2 = (Double) ((Object[]) o2)[0];
    // Double.compare gives a consistent total order. Truncating
    // Math.ceil(distance1 - distance2) to int reported "equal" whenever
    // distance1 was smaller by less than 1 but "greater" for the mirrored
    // pair, which violates the Comparator contract and lets nearer
    // neighbors end up behind farther ones.
    return Double.compare(distance1, distance2);
  }
});
// copy the instances of the leading entries into the neighbor array,
// stopping at nearestNeighbors entries or when the list runs out
Iterator entryIterator = distanceToInstance.iterator();
int j = 0;
while (j < nearestNeighbors && entryIterator.hasNext()) {
  Object[] distanceAndInstance = (Object[]) entryIterator.next();
  nnArray[j] = (Instance) distanceAndInstance[1];
  j += 1;
}
// create synthetic examples
int n = (int) Math.floor(getPercentage() / 100);
while(n > 0 || extraIndexSet.remove(i)) {
double[] values = new double[sample.numAttributes()];
int nn = rand.nextInt(nearestNeighbors);
attrEnum = getInputFormat().enumerateAttributes();
while(attrEnum.hasMoreElements()) {
Attribute attr = (Attribute) attrEnum.nextElement();
if (!attr.equals(getInputFormat().classAttribute())) {
if (attr.isNumeric()) {
double dif = nnArray[nn].value(attr) - instanceI.value(attr);
double gap = rand.nextDouble();
values[attr.index()] = (double) (instanceI.value(attr) + gap * dif);
} else if (attr.isDate()) {
double dif = nnArray[nn].value(attr) - instanceI.value(attr);
double gap = rand.nextDouble();
values[attr.index()] = (long) (instanceI.value(attr) + gap * dif);
} else {
int[] valueCounts = new int[attr.numValues()];
int iVal = (int) instanceI.value(attr);
valueCounts[iVal]++;
for (int nnEx = 0; nnEx < nearestNeighbors; nnEx++) {
int val = (int) nnArray[nnEx].value(attr);
valueCounts[val]++;
}
int maxIndex = 0;
int max = Integer.MIN_VALUE;
for (int index = 0; index < attr.numValues(); index++) {
if (valueCounts[index] > max) {
max = valueCounts[index];
maxIndex = index;
}
}
values[attr.index()] = maxIndex;
}
}
}
values[sample.classIndex()] = minIndex;
Instance synthetic = new Instance(1.0, values);