continue;
}
final NumberVector<?, ?> vec = relation.get(id);
TiedTopBoundedHeap<DoubleIntPair> heap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder());
for(int i = 0; i < dim; i++) {
heap.add(new DoubleIntPair(vec.doubleValue(i + 1), i));
}
if(heap.size() >= 2 * estimated_outliers) {
logger.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size());
}
for(DoubleIntPair pair : heap) {
if(outliers_seen[pair.second] == 0) {
outliers_seen[pair.second] = 1;
union_outliers += 1;
}
else {
outliers_seen[pair.second] += 1;
}
}
}
}
logger.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers");
// Build the final weight vector.
// Two zero-initialised arrays with one slot per dimension are handed to
// updateEstimations together with the per-dimension counters (outliers_seen)
// and the number of distinct dimensions counted so far (union_outliers).
// NOTE(review): presumably updateEstimations fills both arrays in place;
// its definition is not visible here - confirm.
final double[] estimated_weights = new double[dim];
final double[] estimated_truth = new double[dim];
updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
// Wrap the estimated target array as a vector, using refvec as the factory.
NumberVector<?, ?> estimated_truth_vec = refvec.newNumberVector(estimated_truth);
// Distance function obtained from the estimated weights.
PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> wdist = getDistanceFunction(estimated_weights);
// tdist is an alias: the very same function object is used under both names.
PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> tdist = wdist;
// Naive ensemble: per-dimension sum over every row except firstid, divided
// by (relation.size() - 1), then wrapped as a vector via refvec.
final double[] naiveensemble = new double[dim];
{
  // Pass 1: accumulate the per-dimension sums.
  for(DBID member : relation.iterDBIDs()) {
    if(!firstid.equals(member)) {
      final NumberVector<?, ?> scores = relation.get(member);
      int pos = 0;
      while(pos < dim) {
        // Vector components are addressed 1-based, the array 0-based.
        naiveensemble[pos] = naiveensemble[pos] + scores.doubleValue(pos + 1);
        pos++;
      }
    }
  }
  // Pass 2: turn the sums into averages.
  for(int pos = 0; pos < dim; pos++) {
    naiveensemble[pos] = naiveensemble[pos] / (relation.size() - 1);
  }
}
NumberVector<?, ?> naivevec = refvec.newNumberVector(naiveensemble);
// Score every individual row (all rows except firstid) and keep track of:
// - the highest ROC AUC seen and the label of that row,
// - the lowest tdist distance to refvec ("cost") and the label of that row,
// - the id of the row with the lowest wdist distance to the estimated
// target vector.
double bestauc = 0.0;
String bestaucstr = "";
double bestcost = Double.POSITIVE_INFINITY;
String bestcoststr = "";
DBID bestid = null;
double bestest = Double.POSITIVE_INFINITY;
{
  for(DBID cand : relation.iterDBIDs()) {
    if(!firstid.equals(cand)) {
      final NumberVector<?, ?> scores = relation.get(cand);
      final double rocauc = computeROCAUC(scores, positive, dim);
      final double toEstimate = wdist.doubleDistance(scores, estimated_truth_vec);
      final double toReference = tdist.doubleDistance(scores, refvec);
      logger.verbose("ROC AUC: " + rocauc + " estimated " + toEstimate + " cost " + toReference + " " + labels.get(cand));
      // Strictly better only: on ties the earlier row is kept.
      if(bestauc < rocauc) {
        bestauc = rocauc;
        bestaucstr = labels.get(cand);
      }
      if(bestcost > toReference) {
        bestcost = toReference;
        bestcoststr = labels.get(cand);
      }
      if(bestest > toEstimate) {
        bestest = toEstimate;
        bestid = cand;
      }
    }
  }
}
// Initialize ensemble with "best" method
// bestid is the row whose wdist distance to estimated_truth_vec was the
// smallest in the scoring loop.
// NOTE(review): bestid is still null here if no row ever scored strictly
// below Double.POSITIVE_INFINITY (no candidate rows, or only NaN/infinite
// distances) - confirm that labels.get and DBIDUtil.newArray tolerate that.
logger.verbose("Distance function: " + wdist);
logger.verbose("Initial estimation of outliers: " + union_outliers);
logger.verbose("Initializing ensemble with: " + labels.get(bestid));
// Selected members; presumably an array-backed id collection that starts out
// containing only bestid - confirm against DBIDUtil.newArray.
ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid);
// Candidate pool: the ids of the relation minus the seed and minus firstid.
ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs());
enscands.remove(bestid);
enscands.remove(firstid);
// Current combined values of the ensemble, seeded with a copy of bestid's row.
final double[] greedyensemble = new double[dim];
{
final NumberVector<?, ?> vec = relation.get(bestid);
for(int i = 0; i < dim; i++) {
// Vector components are addressed 1-based, the array 0-based.
greedyensemble[i] = vec.doubleValue(i + 1);
}
}
// Greedily grow the ensemble
final double[] testensemble = new double[dim];
while(enscands.size() > 0) {
NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble);
// Weighting factors for combining:
double s1 = ensemble.size() / (ensemble.size() + 1.);
double s2 = 1. / (ensemble.size() + 1.);
final int heapsize = enscands.size();
TopBoundedHeap<DoubleObjPair<DBID>> heap = new TopBoundedHeap<DoubleObjPair<DBID>>(heapsize, Collections.reverseOrder());
for(DBID id : enscands) {
final NumberVector<?, ?> vec = relation.get(id);
double diversity = wdist.doubleDistance(vec, greedyvec);
heap.add(new DoubleObjPair<DBID>(diversity, id));
}
while(heap.size() > 0) {
DBID bestadd = heap.poll().second;
enscands.remove(bestadd);
// Update ensemble:
final NumberVector<?, ?> vec = relation.get(bestadd);
for(int i = 0; i < dim; i++) {
testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i + 1) * s2;
}
NumberVector<?, ?> testvec = refvec.newNumberVector(testensemble);
double oldd = wdist.doubleDistance(estimated_truth_vec, greedyvec);
double newd = wdist.doubleDistance(estimated_truth_vec, testvec);
// logger.verbose("Distances: " + oldd + " vs. " + newd);
if(newd < oldd) {
System.arraycopy(testensemble, 0, greedyensemble, 0, dim);
ensemble.add(bestadd);
// logger.verbose("Growing ensemble with: " + labels.get(bestadd));
break; // Recompute heap
}
else {
// logger.verbose("Discarding: " + labels.get(bestadd));
if(refine_truth) {
boolean refresh = false;
// Update target vectors and weights
TiedTopBoundedHeap<DoubleIntPair> oheap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder());
for(int i = 0; i < dim; i++) {
oheap.add(new DoubleIntPair(vec.doubleValue(i + 1), i));
}
for(DoubleIntPair pair : oheap) {
assert (outliers_seen[pair.second] > 0);
outliers_seen[pair.second] -= 1;
if(outliers_seen[pair.second] == 0) {