package cc.mallet.util;
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
 * Class of static methods for calculating statistics of a SparseVector sample
 * packaged in an InstanceList.
 *
 * @author Jerod Weinman
 */
import java.util.Arrays;
import java.util.Iterator;
import cc.mallet.types.*;
import gnu.trove.TIntHashSet;
public class VectorStats {
* Returns a <CODE>SparseVector</CODE> whose entries (taken from the union of
* those in the instances) are the expected values of those in the
* <CODE>InstanceList</CODE>. This implies the returned vector will not have
* binary values.
public static SparseVector mean(InstanceList instances) {
if (instances == null || instances.size() == 0)
return null;
Iterator<Instance> instanceItr = instances.iterator();
SparseVector v;
Instance instance;
int indices[];
int maxSparseIndex = -1;
int maxDenseIndex = -1;
// First, we find the union of all the indices used in the instances
TIntHashSet hIndices = new TIntHashSet(instances.getDataAlphabet().size());
while (instanceItr.hasNext()) {
instance = (Instance);
v = (SparseVector) (instance.getData());
indices = v.getIndices();
if (indices != null) {
if (indices[indices.length - 1] > maxSparseIndex)
maxSparseIndex = indices[indices.length - 1];
} else // dense
if (v.numLocations() > maxDenseIndex)
maxDenseIndex = v.numLocations() - 1;
if (maxDenseIndex > -1) // dense vectors were present
if (maxSparseIndex > maxDenseIndex)
// sparse vectors were present and they had greater indices than
// the dense vectors
// therefore, we create sparse vectors and
// add all the dense indices
for (int i = 0; i <= maxDenseIndex; i++)
} else
// sparse indices may have been present, but we don't care
// since they never had indices that exceeded those of the
// dense vectors
return mean(instances, maxDenseIndex + 1);
// reaching this statement implies we can create a sparse vector
return mean(instances, hIndices.toArray());
* Returns a <CODE>SparseVector</CODE> whose entries (dense with the given
* number of indices) are the expected values of those in the
* <CODE>InstanceList</CODE>. This implies the returned vector will not have
* binary values.
public static SparseVector mean(InstanceList instances, int numIndices) {
SparseVector mv = new SparseVector(new double[numIndices], false);
return mean(instances, mv);
* Returns a <CODE>SparseVector</CODE> whose entries (the given indices) are
* the expected values of those in the <CODE>InstanceList</CODE>. This implies
* the returned vector will not have binary values.
public static SparseVector mean(InstanceList instances, int[] indices) {
// Create the mean vector with the indices having all zeros,
// nothing copied, sorted, and no checks for duplicates.
// it is faster to sort indices first
SparseVector mv = new SparseVector(indices, new double[indices.length],
// it is faster to sort indices first (above)
// false, true, false);
false, false, false);
return mean(instances, mv);
private static SparseVector mean(InstanceList instances,
SparseVector meanVector) {
if (instances == null || instances.size() == 0)
return null;
Instance instance;
SparseVector v;
Iterator<Instance> instanceItr = instances.iterator();
double factor = 1.0 / (double) instances.size();
while (instanceItr.hasNext()) {
instance = (Instance);
v = (SparseVector) (instance.getData());
meanVector.plusEqualsSparse(v, factor);
return meanVector;
* Returns a <CODE>SparseVector</CODE> whose entries (taken from the union of
* those in the instances) are the variance of those in the
* <CODE>InstanceList</CODE>. This implies the returned vector will not have
* binary values.
* @param unbiased
* Normalizes by N-1 when true, and by N otherwise.
public static SparseVector variance(InstanceList instances, boolean unbiased) {
return variance(instances, mean(instances), unbiased);
* Returns a <CODE>SparseVector</CODE> whose entries (taken from the mean
* argument) are the variance of those in the <CODE>InstanceList</CODE>. This
* implies the returned vector will not have binary values.
* @param unbiased
* Normalizes by N-1 when true, and by N otherwise.
public static SparseVector variance(InstanceList instances,
SparseVector mean, boolean unbiased)
if (instances == null || instances.size() == 0)
return null;
double factor = 1.0 / (double) (instances.size() - (unbiased ? 1.0 : 0.0));
System.out.println("factor = " + factor);
SparseVector v;
// var = (x^2 - n*mu^2)/(n-1)
SparseVector vv = (SparseVector) mean.cloneMatrix();
vv.timesEqualsSparse(vv, -(double) instances.size() * factor);
Iterator<Instance> instanceItr = instances.iterator();
Instance instance;
while (instanceItr.hasNext()) {
instance = (Instance);
v = (SparseVector) ((SparseVector) (instance.getData())).cloneMatrix();
vv.plusEqualsSparse(v, factor);
System.out.println("Var:\n" + vv);
return vv;
/** Returns unbiased variance */
public static SparseVector variance(InstanceList instances) {
return variance(instances, true);
/** Returns unbiased variance of instances having the given mean. */
public static SparseVector variance(InstanceList instances, SparseVector mean) {
return variance(instances, mean, true);
* Square root of variance.
* @param mean
* Mean of the given instances.
* @param unbiased
* Normalizes variance by N-1 when true, and by N otherwise.
* @see variance
public static SparseVector stddev(InstanceList instances, SparseVector mean,
boolean unbiased) {
if (instances.size() == 0)
return null;
SparseVector sv = variance(instances, mean, unbiased);
int dim = sv.numLocations();
double val;
for (int i = 0; i < dim; i++) {
val = sv.valueAtLocation(i);
sv.setValueAtLocation(i, Math.sqrt(val));
return sv;
/** Square root of unbiased variance. */
public static SparseVector stddev(InstanceList instances) {
return stddev(instances, true);
* Square root of variance.
* @param unbiased
* Normalizes variance by N-1 when true, and by N otherwise.
* @see variance
public static SparseVector stddev(InstanceList instances, boolean unbiased) {
return stddev(instances, mean(instances), unbiased);
/** Square root of unbiased variance of instances having the given mean */
public static SparseVector stddev(InstanceList instances, SparseVector mean) {
return stddev(instances, mean, true);