package de.lmu.ifi.dbs.elki.evaluation.paircounting;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2011
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.Collection;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.evaluation.paircounting.generator.PairGeneratorMerge;
import de.lmu.ifi.dbs.elki.evaluation.paircounting.generator.PairGeneratorNoise;
import de.lmu.ifi.dbs.elki.evaluation.paircounting.generator.PairGeneratorSingleCluster;
import de.lmu.ifi.dbs.elki.evaluation.paircounting.generator.PairSortedGeneratorInterface;
import de.lmu.ifi.dbs.elki.utilities.pairs.Triple;
/**
 * Compare two clustering results using a pair-counting F-Measure.
 * 
 * A pair is any two objects that belong to the same cluster.
 * 
 * Two clusterings are compared by comparing their pairs; if two clusterings
 * completely agree, they also agree on every pair; even when the clusters and
 * points are ordered differently.
 * 
 * An empty clustering will of course have no pairs, the trivial all-in-one
 * clustering of course has n^2 pairs. Therefore neither recall nor precision
 * itself are useful, however their combination -- the F-Measure -- is useful.
 * 
 * @author Erich Schubert
 * 
 * @apiviz.uses de.lmu.ifi.dbs.elki.evaluation.paircounting.generator.PairSortedGeneratorInterface
 * @apiviz.uses de.lmu.ifi.dbs.elki.evaluation.paircounting.generator.PairGeneratorNoise
 * @apiviz.uses de.lmu.ifi.dbs.elki.evaluation.paircounting.generator.PairGeneratorSingleCluster
 * @apiviz.uses de.lmu.ifi.dbs.elki.evaluation.paircounting.generator.PairGeneratorMerge
 */
public class PairCountingFMeasure {
  /**
   * Get a pair generator for the given Clustering.
   * 
   * One generator is created per cluster returned by
   * {@link Clustering#getAllClusters()}, and all of them are combined into a
   * single merging generator.
   * 
   * @param <R> Clustering result class
   * @param <M> Model type
   * @param clusters Clustering result
   * @param noiseSpecial Special handling for "noise clusters": if set, clusters
   *        flagged as noise use a {@link PairGeneratorNoise}
   * @param hierarchicalSpecial Special handling for hierarchical clusters
   *        (passed on to {@link PairGeneratorSingleCluster})
   * @return Sorted pair generator
   */
  public static <R extends Clustering<M>, M extends Model> PairSortedGeneratorInterface getPairGenerator(R clusters, boolean noiseSpecial, boolean hierarchicalSpecial) {
    // collect all clusters into a flat list.
    Collection<Cluster<M>> allclusters = clusters.getAllClusters();
    // Make generators for each cluster
    PairSortedGeneratorInterface[] gens = new PairSortedGeneratorInterface[allclusters.size()];
    int i = 0;
    for(Cluster<?> c : allclusters) {
      if(noiseSpecial && c.isNoise()) {
        gens[i] = new PairGeneratorNoise(c);
      }
      else {
        gens[i] = new PairGeneratorSingleCluster(c, hierarchicalSpecial);
      }
      i++;
    }
    return new PairGeneratorMerge(gens);
  }

  /**
   * Compare two clustering results.
   * 
   * The pairs are counted with 64 bit counters, so the result stays meaningful
   * even when the number of pairs exceeds the int range.
   * 
   * @param <R> Result type
   * @param <M> Model type
   * @param <S> Result type
   * @param <N> Model type
   * @param result1 first result
   * @param result2 second result
   * @param beta Beta value for the F-Measure
   * @param noiseSpecial Noise receives special treatment
   * @param hierarchicalSpecial Special handling for hierarchical clusters
   * @return Pair counting F-Measure result.
   */
  public static <R extends Clustering<M>, M extends Model, S extends Clustering<N>, N extends Model> double compareClusterings(R result1, S result2, double beta, boolean noiseSpecial, boolean hierarchicalSpecial) {
    PairSortedGeneratorInterface first = getPairGenerator(result1, noiseSpecial, hierarchicalSpecial);
    PairSortedGeneratorInterface second = getPairGenerator(result2, noiseSpecial, hierarchicalSpecial);
    // Use the long counts directly instead of the Integer triple of
    // countPairs, which cannot represent more than Integer.MAX_VALUE pairs.
    long[] counts = countPairsLong(first, second);
    return fMeasureOf(counts[0], counts[1], counts[2], beta);
  }

  /**
   * Compare two clustering results, without special handling for noise or
   * hierarchical clusters.
   * 
   * @param <R> Result type
   * @param <M> Model type
   * @param <S> Result type
   * @param <N> Model type
   * @param result1 first result
   * @param result2 second result
   * @param beta Beta value for the F-Measure
   * @return Pair counting F-Measure result.
   */
  public static <R extends Clustering<M>, M extends Model, S extends Clustering<N>, N extends Model> double compareClusterings(R result1, S result2, double beta) {
    return compareClusterings(result1, result2, beta, false, false);
  }

  /**
   * Compare two clustering results using beta = 1.
   * 
   * @param <R> Result type
   * @param <M> Model type
   * @param <S> Result type
   * @param <N> Model type
   * @param result1 first result
   * @param result2 second result
   * @param noiseSpecial Noise receives special treatment
   * @param hierarchicalSpecial Special handling for hierarchical clusters
   * @return Pair counting F-1-Measure result.
   */
  public static <R extends Clustering<M>, M extends Model, S extends Clustering<N>, N extends Model> double compareClusterings(R result1, S result2, boolean noiseSpecial, boolean hierarchicalSpecial) {
    return compareClusterings(result1, result2, 1.0, noiseSpecial, hierarchicalSpecial);
  }

  /**
   * Compare two clustering results using beta = 1 and without special handling
   * for noise or hierarchical clusters.
   * 
   * @param <R> Result type
   * @param <M> Model type
   * @param <S> Result type
   * @param <N> Model type
   * @param result1 first result
   * @param result2 second result
   * @return Pair counting F-1-Measure result.
   */
  public static <R extends Clustering<M>, M extends Model, S extends Clustering<N>, N extends Model> double compareClusterings(R result1, S result2) {
    return compareClusterings(result1, result2, 1.0, false, false);
  }

  /**
   * Count the pairs of two clustering results. It determines how many pairs
   * are in both clusterings, just in the first clustering or just in the second
   * clustering. Noise and hierarchical clusters receive no special handling.
   * 
   * @param <R> Result type
   * @param <M> Model type
   * @param <S> Result type
   * @param <N> Model type
   * @param result1 first result
   * @param result2 second result
   * @return Returns a {@link Triple} that contains the number of pairs that
   *         are in both clusterings (FIRST), the number of pairs that are just
   *         in the first clustering (SECOND) and the number of pairs that are
   *         just in the second clustering (THIRD).
   * @throws ArithmeticException if one of the counts exceeds the int range
   */
  public static <R extends Clustering<M>, M extends Model, S extends Clustering<N>, N extends Model> Triple<Integer, Integer, Integer> countPairs(R result1, S result2) {
    PairSortedGeneratorInterface first = getPairGenerator(result1, false, false);
    PairSortedGeneratorInterface second = getPairGenerator(result2, false, false);
    return countPairs(first, second);
  }

  /**
   * Compare two sets of generated pairs. It determines how many pairs are in
   * both sets, just in the first set or just in the second set.
   * 
   * Both generators are advanced until they are exhausted.
   * 
   * @param first first set
   * @param second second set
   * @return Returns a {@link Triple} that contains the number of pairs that
   *         are in both sets (FIRST), the number of pairs that are just in the
   *         first set (SECOND) and the number of pairs that are just in the
   *         second set (THIRD).
   * @throws ArithmeticException if one of the counts exceeds the int range
   *         (instead of silently returning a wrapped-around value)
   */
  public static Triple<Integer, Integer, Integer> countPairs(PairSortedGeneratorInterface first, PairSortedGeneratorInterface second) {
    long[] counts = countPairsLong(first, second);
    int inboth = toIntCount(counts[0], "in both sets");
    int infirst = toIntCount(counts[1], "only in the first set");
    int insecond = toIntCount(counts[2], "only in the second set");
    return new Triple<Integer, Integer, Integer>(inboth, infirst, insecond);
  }

  /**
   * Merge-style walk over two sorted pair generators, counting with 64 bit
   * counters.
   * 
   * @param first first set
   * @param second second set
   * @return array of <code>{ in both, only in first, only in second }</code>
   */
  private static long[] countPairsLong(PairSortedGeneratorInterface first, PairSortedGeneratorInterface second) {
    long inboth = 0;
    long infirst = 0;
    long insecond = 0;
    // Both generators still have pairs: advance whichever is behind.
    while(first.current() != null && second.current() != null) {
      int cmp = first.current().compareTo(second.current());
      if(cmp == 0) {
        inboth++;
        first.next();
        second.next();
      }
      else if(cmp < 0) {
        infirst++;
        first.next();
      }
      else {
        insecond++;
        second.next();
      }
    }
    // At most one of the generators has pairs left; these are unmatched.
    while(first.current() != null) {
      infirst++;
      first.next();
    }
    while(second.current() != null) {
      insecond++;
      second.next();
    }
    return new long[] { inboth, infirst, insecond };
  }

  /**
   * Narrow a pair count to int, failing loudly when it does not fit.
   * 
   * @param count Pair count (non-negative)
   * @param what Description of the count, used in the error message
   * @return the count as int
   * @throws ArithmeticException if the count exceeds the int range
   */
  private static int toIntCount(long count, String what) {
    if(count > Integer.MAX_VALUE) {
      throw new ArithmeticException("Number of pairs " + what + " (" + count + ") exceeds the int range.");
    }
    return (int) count;
  }

  /**
   * Computes the F-measure of the given parameters.
   * <p>
   * Returns
   * <code>((1+beta*beta) * inBoth) / ((1+beta*beta) * inBoth + (beta*beta)*inFirst + inSecond)</code>
   * </p>
   * <p>
   * If all three counts are zero, the result is <code>NaN</code>.
   * </p>
   * 
   * @param inBoth The number of objects that are in both sets.
   * @param inFirst The number of objects that are just in the first set.
   * @param inSecond The number of objects that are just in the second set.
   * @param beta The beta value for the f-measure.
   * @return The F-measure.
   */
  public static double fMeasure(int inBoth, int inFirst, int inSecond, double beta) {
    return fMeasureOf(inBoth, inFirst, inSecond, beta);
  }

  /**
   * F-measure computation on 64 bit counts; see
   * {@link #fMeasure(int, int, int, double)} for the formula.
   * 
   * @param inBoth The number of objects that are in both sets.
   * @param inFirst The number of objects that are just in the first set.
   * @param inSecond The number of objects that are just in the second set.
   * @param beta The beta value for the f-measure.
   * @return The F-measure.
   */
  private static double fMeasureOf(long inBoth, long inFirst, long inSecond, double beta) {
    final double beta2 = beta * beta;
    final double weightedBoth = (1 + beta2) * inBoth;
    return weightedBoth / (weightedBoth + beta2 * inFirst + inSecond);
  }
}