/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/** @author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a> */
package cc.mallet.pipe;
import java.util.ArrayList;
import java.util.logging.*;
import cc.mallet.classify.*;
import cc.mallet.classify.evaluate.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.*;
import cc.mallet.util.PropertyList;
/** Pipes features from the underlying classifier's output to
 * the confidence-prediction instance list.
 */
public class Classification2ConfidencePredictingFeatureVector extends Pipe
/**
 * Creates the pipe with a newly constructed data {@link Alphabet} and a newly
 * constructed target {@link LabelAlphabet}, both handed to the {@code Pipe}
 * superclass constructor.
 */
public Classification2ConfidencePredictingFeatureVector ()
{
	super (new Alphabet(), new LabelAlphabet());
}
/**
 * Replaces a classified instance's data with features describing the classifier's
 * score distribution, and its target with a "correct"/"incorrect" label.
 *
 * The carrier's data is read as a Classification. Features are accumulated in a
 * PropertyList and converted to a FeatureVector over this pipe's data alphabet; the
 * target is looked up in this pipe's target alphabet according to
 * classification.bestLabelIsCorrect().
 *
 * NOTE(review): in this view of the file no closing braces or block-comment
 * delimiters are visible, so it cannot be determined here which of the statements
 * below are live and which were originally disabled; several feature names are added
 * more than once. Confirm against the version-controlled original before relying on
 * the exact feature set.
 *
 * @param carrier instance whose data is cast to Classification
 * @return the same carrier object, after its target and data have been replaced
 */
public Instance pipe (Instance carrier)
// Unpack the classification, its score vector, and the originally classified instance.
Classification classification = (Classification) carrier.getData();
PropertyList features = null;
LabelVector lv = classification.getLabelVector();
Label bestLabel = lv.getBestLabel();
Instance inst = (Instance)classification.getInstance();
FeatureVector fv = (FeatureVector)inst.getData();
Alphabet fdict = fv.getAlphabet();
// Cutoffs used by the indicator ("...BelowX" / "...AboveX") features further down.
double winningThreshold = .990;
double varianceThreshold = .15;
double secondThreshold = .03;
// NOTE(review): rank 1 is read unconditionally here and below; presumably the label
// vector must hold at least two labels -- confirm.
double winningScore = lv.getValueAtRank(0);
double marginOfVictory = winningScore - lv.getValueAtRank(1);
// attempts to use the confusion matrix of the training list
// as some prior knowledge in training
features = PropertyList.add ("winningScore", winningScore, features);
features = PropertyList.add ("secondScore", lv.getValueAtRank(1), features);
for(int i=0; i<lv.numLocations(); i++) {
// features = PropertyList.add (lv.getLabelAtRank(i).toString() +"HasRank"+i, 1.0, features);
// NOTE(review): the feature name uses the label at *rank* i but the value is read at
// *location* i; confirm these two indices are meant to correspond.
features = PropertyList.add (lv.getLabelAtRank(i).toString() +"HasValue", lv.valueAtLocation (i), features);
features = PropertyList.add ("MarginOfVictory", marginOfVictory, features);
// Ratio of the original feature vector's location count to the size of its alphabet.
features = PropertyList.add("numFeatures", ((double)fv.numLocations()/fdict.size()), features);
features = PropertyList.add (bestLabel.toString() + "IsFirst-" + lv.getLabelAtRank(1).toString()+"IsSecond", 1.0, features);
// Difference between the top-ranked score and the last-ranked score.
features = PropertyList.add ("Range", winningScore - lv.getValueAtRank(lv.numLocations()-1), features);
features = PropertyList.add (bestLabel.toString()+"IsFirst", 1.0, features);
features = PropertyList.add (lv.getLabelAtRank(1).toString() + "IsSecond", 1.0, features);
// loop through original feature vector
// and add each feature to PropertyList
// features = PropertyList.add ("winningScore", winningScore, features);
// features = PropertyList.add ("secondScore", lv.getValueAtRank(1), features);
// features = PropertyList.add (bestLabel.toString()+"IsFirst", 1.0, features);
// features = PropertyList.add (lv.getLabelAtRank(1).toString() + "IsSecond", 1.0, features);
// xxx this hurt performance. is this correct function call?
// for(int loc = 0; loc < fv.numLocations(); loc++)
// features = PropertyList.add(fdict.lookupObject(loc).toString(), 1.0, features);
//features = PropertyList.add ("winningClassPrecision", confusionMatrix.getPrecision(lv.getBestIndex()) , features);
// features = PropertyList.add ("confusionBetweenTop2", confusionMatrix.getConfusionBetween(lv.getBestIndex(), lv.getIndexAtRank(1)) , features);
//features = PropertyList.add ("Variance",getScoreVariance(lv), features);
// use cutoffs of some metrics
// NOTE(review): the closing braces of the conditionals below are not visible in this
// view, so their intended nesting cannot be confirmed from here.
if(winningScore < winningThreshold){
features = PropertyList.add ("WinningScoreBelowX", 1.0, features);
if(classification.bestLabelIsCorrect()) {
if(marginOfVictory < .9)
features = PropertyList.add ("MarginOfVictoryBelow.9", 1.0, features);
if(getScoreVariance(lv) < varianceThreshold) {
features = PropertyList.add ("VarianceBelowX", 1.0, features);
if(lv.getValueAtRank(1) > secondThreshold) {
features = PropertyList.add ("SecondScoreAboveX", 1.0, features);
// all the confidence predicting features
// NOTE(review): the statements from here on repeat feature names already added above
// ("winningScore", "secondScore", "...IsFirst", "...IsSecond", "numFeatures", the
// threshold indicators); presumably this was an alternative, disabled feature set --
// confirm.
features = PropertyList.add ("winningScore", winningScore, features);
features = PropertyList.add(bestLabel.toString()+"IsFirst", 1.0, features);
features = PropertyList.add (lv.getLabelAtRank(1).toString() + "IsSecond", 1.0, features);
features = PropertyList.add ("secondScore", lv.getValueAtRank(1), features);
for(int i=0; i<lv.numLocations(); i++) {
features = PropertyList.add (lv.getLabelAtRank(i).toString() +"HasRank"+i, lv.getValueAtRank(i), features);
if(marginOfVictory < .9)
features = PropertyList.add ("MarginOfVictoryBelow.9", 1.0, features);
if(winningScore < winningThreshold){
features = PropertyList.add ("WinningScoreBelowX", 1.0, features);
if(getScoreVariance(lv) < varianceThreshold) {
features = PropertyList.add ("VarianceBelowX", 1.0, features);
if(lv.getValueAtRank(1) > secondThreshold) {
features = PropertyList.add ("SecondScoreAboveX", 1.0, features);
// One feature per entry of the label alphabet, valued by the score at that index.
LabelAlphabet vocab = lv.getLabelAlphabet();
for(int i=0; i<vocab.size(); i++) {
features = PropertyList.add(vocab.lookupObject(i).toString()+"'sScore", lv.valueAtLocation(i), features);
features = PropertyList.add("numFeatures", ((double)fv.numLocations()/fdict.size()), features);
features = PropertyList.add (bestLabel.toString() + "IsFirst-" + lv.getLabelAtRank(1).toString()+"IsSecond", 1.0, features);
features = PropertyList.add("marginOfVictory", lv.getBestValue() - lv.getValueAtRank(1), features);
// xxx these features either had 0 info gain or had a negative
// impact on performance
features = PropertyList.add ("scoreVariance", getScoreVariance(lv), features);
features = PropertyList.add ("scoreMean", getScoreMean(lv), features);
// loop through original feature vector
// and add each feature to PropertyList
// xxx this hurt performance. is this correct function call?
//for(int loc = 0; loc < fv.numLocations(); loc++)
// features = PropertyList.add(fdict.lookupObject(loc).toString(), 1.0, features);
// ...
// ...
// Target: the "correct" or "incorrect" label from this pipe's target alphabet, chosen
// by classification.bestLabelIsCorrect().
carrier.setTarget(((LabelAlphabet)getTargetAlphabet()).lookupLabel(classification.bestLabelIsCorrect() ? "correct" : "incorrect"));
// Data: a feature vector built from the accumulated property list over this pipe's
// data alphabet.
carrier.setData(new FeatureVector ((Alphabet) getDataAlphabet(), features, false));
return carrier;
private double getScoreMean(LabelVector lv)
double sum = 0.0;
for(int i=0; i<lv.numLocations(); i++) {
sum += lv.getValueAtRank(i);
return sum / lv.numLocations();
private double getScoreVariance(LabelVector lv)
double mean = getScoreMean(lv);
double squaredDifference = 0.0;
for(int i=0; i<lv.numLocations(); i++) {
squaredDifference += (mean - lv.getValueAtRank(i)) * (mean - lv.getValueAtRank(i));
return squaredDifference / lv.numLocations();