/**
* Copyright (c) 2004-2006 Regents of the University of California.
* See "license-prefuse.txt" for licensing terms.
*/
package prefuse.util;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import prefuse.data.Table;
import prefuse.data.Tuple;
import prefuse.data.column.ColumnMetadata;
import prefuse.data.tuple.TupleSet;
import prefuse.util.collections.DefaultLiteralComparator;
/**
 * Functions for processing an iterator of tuples, including the creation
 * of arrays of particular tuple data values and summary
 * statistics (min, max, median, mean, standard deviation).
 *
 * <p>Every method that accepts an {@link Iterator} consumes it. An iterator
 * can be traversed only once, so the same iterator instance must not be
 * handed to two of these methods in succession.</p>
 *
 * @author <a href="http://jheer.org">jeffrey heer</a>
 */
public class DataLib {

    /**
     * Get an array containing all data values for a given tuple iteration
     * and field. Values appear in iteration order and duplicates are kept.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return an array containing the data values, sized to the number of
     * tuples visited
     */
    public static Object[] toArray(Iterator tuples, String field) {
        Object[] array = new Object[100];
        int i = 0;
        for ( ; tuples.hasNext(); ++i ) {
            if ( i >= array.length ) {
                // buffer is full: grow it by 50%
                array = ArrayLib.resize(array, 3*array.length/2);
            }
            array[i] = ((Tuple)tuples.next()).get(field);
        }
        // drop the unused tail of the buffer
        return ArrayLib.trim(array, i);
    }

    /**
     * Get an array of doubles containing all data values for a given tuple
     * iteration and field. Each value is read with
     * {@link Tuple#getDouble(String)}; whatever that call throws for a
     * field that cannot be read as a double propagates to the caller.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return an array of doubles containing the data values, in iteration
     * order
     */
    public static double[] toDoubleArray(Iterator tuples, String field) {
        double[] array = new double[100];
        int i = 0;
        for ( ; tuples.hasNext(); ++i ) {
            if ( i >= array.length ) {
                // buffer is full: grow it by 50%
                array = ArrayLib.resize(array, 3*array.length/2);
            }
            array[i] = ((Tuple)tuples.next()).getDouble(field);
        }
        // drop the unused tail of the buffer
        return ArrayLib.trim(array, i);
    }

    // ------------------------------------------------------------------------

    /**
     * Get a sorted array containing the distinct data values for a given
     * tuple iterator and field, ordered by the default literal comparator.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return an array containing the distinct values, sorted
     */
    public static Object[] ordinalArray(Iterator tuples, String field) {
        return DataLib.ordinalArray(tuples, field,
                DefaultLiteralComparator.getInstance());
    }

    /**
     * Get a sorted array containing the distinct data values for a given
     * tuple iterator and field.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @param cmp a comparator for sorting the values
     * @return an array containing the distinct values, sorted by
     * <code>cmp</code>
     */
    public static Object[] ordinalArray(Iterator tuples, String field,
                                        Comparator cmp)
    {
        // get set of all unique values
        HashSet set = new HashSet();
        while ( tuples.hasNext() ) {
            set.add(((Tuple)tuples.next()).get(field));
        }

        // sort the unique values
        Object[] o = set.toArray();
        Arrays.sort(o, cmp);
        return o;
    }

    /**
     * Get a sorted array containing the data values for a given tuple set
     * and field, using the default literal comparator where a comparator
     * is needed.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @return an array containing the column values sorted
     * @see #ordinalArray(TupleSet, String, Comparator)
     */
    public static Object[] ordinalArray(TupleSet tuples, String field) {
        return ordinalArray(tuples, field,
                DefaultLiteralComparator.getInstance());
    }

    /**
     * Get a sorted array containing the data values for a given tuple set
     * and field. If the tuple set is a {@link Table}, the array is taken
     * from the column's {@link ColumnMetadata} and <code>cmp</code> is not
     * consulted; otherwise the tuples are iterated and the distinct values
     * sorted with <code>cmp</code>.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @param cmp a comparator for sorting the column contents; only used
     * when <code>tuples</code> is not a Table
     * @return an array containing the column values sorted
     */
    public static Object[] ordinalArray(TupleSet tuples, String field,
                                        Comparator cmp)
    {
        if ( tuples instanceof Table ) {
            ColumnMetadata md = ((Table)tuples).getMetadata(field);
            return md.getOrdinalArray();
        } else {
            return ordinalArray(tuples.tuples(), field, cmp);
        }
    }

    // ------------------------------------------------------------------------

    /**
     * Get map mapping from data values (as Object instances) to their
     * ordinal index in a sorted array, ordered by the default literal
     * comparator.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return a map mapping data values to their position in a sorted
     * order of values
     */
    public static Map ordinalMap(Iterator tuples, String field) {
        return ordinalMap(tuples, field,
                DefaultLiteralComparator.getInstance());
    }

    /**
     * Get map mapping from data values (as Object instances) to their
     * ordinal index in a sorted array.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @param cmp a comparator for sorting the values
     * @return a map mapping each distinct value to its zero-based position
     * (as an Integer) in the order defined by <code>cmp</code>
     */
    public static Map ordinalMap(Iterator tuples, String field, Comparator cmp)
    {
        Object[] o = ordinalArray(tuples, field, cmp);

        // map the values to the non-negative numbers
        HashMap map = new HashMap();
        for ( int i=0; i<o.length; ++i ) {
            map.put(o[i], new Integer(i));
        }
        return map;
    }

    /**
     * Get map mapping from data values (as Object instances) to their
     * ordinal index in a sorted array, using the default literal comparator
     * where a comparator is needed.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @return a map mapping column values to their position in a sorted
     * order of values
     * @see #ordinalMap(TupleSet, String, Comparator)
     */
    public static Map ordinalMap(TupleSet tuples, String field) {
        return ordinalMap(tuples, field,
                DefaultLiteralComparator.getInstance());
    }

    /**
     * Get map mapping from data values (as Object instances) to their
     * ordinal index in a sorted array. If the tuple set is a {@link Table},
     * the map is taken from the column's {@link ColumnMetadata} and
     * <code>cmp</code> is not consulted; otherwise the tuples are iterated
     * and the map is built using <code>cmp</code>.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @param cmp a comparator for sorting the column contents; only used
     * when <code>tuples</code> is not a Table
     * @return a map mapping column values to their position in a sorted
     * order of values
     */
    public static Map ordinalMap(TupleSet tuples, String field, Comparator cmp)
    {
        if ( tuples instanceof Table ) {
            ColumnMetadata md = ((Table)tuples).getMetadata(field);
            return md.getOrdinalMap();
        } else {
            return ordinalMap(tuples.tuples(), field, cmp);
        }
    }

    // ------------------------------------------------------------------------

    /**
     * Get the number of tuples in an iteration. Duplicates will be counted.
     * The tuples themselves are never inspected, so the <code>field</code>
     * argument has no effect on the result.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name (unused)
     * @return the number of elements returned by the iterator
     */
    public static int count(Iterator tuples, String field) {
        int i = 0;
        while ( tuples.hasNext() ) {
            tuples.next();
            ++i;
        }
        return i;
    }

    /**
     * Get the number of distinct values of a data field.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return the number of distinct values
     */
    public static int uniqueCount(Iterator tuples, String field) {
        HashSet set = new HashSet();
        while ( tuples.hasNext() ) {
            set.add(((Tuple)tuples.next()).get(field));
        }
        return set.size();
    }

    // ------------------------------------------------------------------------

    /**
     * Get the Tuple with the minimum data field value, as ordered by the
     * default literal comparator.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return the Tuple with the minimum data field value, or null if the
     * iterator is empty
     */
    public static Tuple min(Iterator tuples, String field) {
        return min(tuples, field, DefaultLiteralComparator.getInstance());
    }

    /**
     * Get the Tuple with the minimum data field value. When several tuples
     * share the minimum value, the first one encountered is returned.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @param cmp a comparator defining the ordering of field values
     * @return the Tuple with the minimum data field value, or null if the
     * iterator is empty
     */
    public static Tuple min(Iterator tuples, String field, Comparator cmp) {
        Tuple t = null, tmp;
        Object min = null;
        // seed the search with the first tuple, if there is one
        if ( tuples.hasNext() ) {
            t = (Tuple)tuples.next();
            min = t.get(field);
        }
        while ( tuples.hasNext() ) {
            tmp = (Tuple)tuples.next();
            Object obj = tmp.get(field);
            // strict comparison keeps the earliest tuple on ties
            if ( cmp.compare(obj,min) < 0 ) {
                t = tmp;
                min = obj;
            }
        }
        return t;
    }

    /**
     * Get the Tuple with the minimum data field value. If the tuple set is
     * a {@link Table}, the row is looked up through the column's
     * {@link ColumnMetadata} and <code>cmp</code> is not consulted;
     * otherwise the tuples are scanned using <code>cmp</code>.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @param cmp a comparator defining the ordering of field values; only
     * used when <code>tuples</code> is not a Table
     * @return the Tuple with the minimum data field value
     */
    public static Tuple min(TupleSet tuples, String field, Comparator cmp) {
        if ( tuples instanceof Table ) {
            Table table = (Table)tuples;
            ColumnMetadata md = table.getMetadata(field);
            return table.getTuple(md.getMinimumRow());
        } else {
            return min(tuples.tuples(), field, cmp);
        }
    }

    /**
     * Get the Tuple with the minimum data field value, using the default
     * literal comparator where a comparator is needed.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @return the Tuple with the minimum data field value
     * @see #min(TupleSet, String, Comparator)
     */
    public static Tuple min(TupleSet tuples, String field) {
        return min(tuples, field, DefaultLiteralComparator.getInstance());
    }

    // ------------------------------------------------------------------------

    /**
     * Get the Tuple with the maximum data field value, as ordered by the
     * default literal comparator.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return the Tuple with the maximum data field value, or null if the
     * iterator is empty
     */
    public static Tuple max(Iterator tuples, String field) {
        return max(tuples, field, DefaultLiteralComparator.getInstance());
    }

    /**
     * Get the Tuple with the maximum data field value. When several tuples
     * share the maximum value, the first one encountered is returned.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @param cmp a comparator defining the ordering of field values
     * @return the Tuple with the maximum data field value, or null if the
     * iterator is empty
     */
    public static Tuple max(Iterator tuples, String field, Comparator cmp) {
        Tuple t = null, tmp;
        Object max = null;
        // seed the search with the first tuple, if there is one
        if ( tuples.hasNext() ) {
            t = (Tuple)tuples.next();
            max = t.get(field);
        }
        while ( tuples.hasNext() ) {
            tmp = (Tuple)tuples.next();
            Object obj = tmp.get(field);
            // strict comparison keeps the earliest tuple on ties
            if ( cmp.compare(obj,max) > 0 ) {
                t = tmp;
                max = obj;
            }
        }
        return t;
    }

    /**
     * Get the Tuple with the maximum data field value. If the tuple set is
     * a {@link Table}, the row is looked up through the column's
     * {@link ColumnMetadata} and <code>cmp</code> is not consulted;
     * otherwise the tuples are scanned using <code>cmp</code>.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @param cmp a comparator defining the ordering of field values; only
     * used when <code>tuples</code> is not a Table
     * @return the Tuple with the maximum data field value
     */
    public static Tuple max(TupleSet tuples, String field, Comparator cmp) {
        if ( tuples instanceof Table ) {
            Table table = (Table)tuples;
            ColumnMetadata md = table.getMetadata(field);
            return table.getTuple(md.getMaximumRow());
        } else {
            return max(tuples.tuples(), field, cmp);
        }
    }

    /**
     * Get the Tuple with the maximum data field value, using the default
     * literal comparator where a comparator is needed.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @return the Tuple with the maximum data field value
     * @see #max(TupleSet, String, Comparator)
     */
    public static Tuple max(TupleSet tuples, String field) {
        return max(tuples, field, DefaultLiteralComparator.getInstance());
    }

    // ------------------------------------------------------------------------

    /**
     * Get the Tuple with the median data field value, as ordered by the
     * default literal comparator.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return the Tuple with the median data field value, or null if the
     * iterator is empty
     */
    public static Tuple median(Iterator tuples, String field) {
        return median(tuples, field, DefaultLiteralComparator.getInstance());
    }

    /**
     * Get the Tuple with the median data field value. The tuples are
     * ordered by their field value and the one at position
     * <code>n/2</code> (integer division, zero-based) is returned, so for
     * an even number of tuples the upper of the two middle tuples is
     * chosen.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @param cmp a comparator for sorting the field values
     * @return the Tuple with the median data field value, or null if the
     * iterator is empty
     */
    public static Tuple median(Iterator tuples, String field, Comparator cmp) {
        // collect all tuples into a growable buffer
        Object[] t = new Object[100];
        int i = 0;
        for ( ; tuples.hasNext(); ++i ) {
            if ( i >= t.length ) {
                t = ArrayLib.resize(t, 3*t.length/2);
            }
            t[i] = tuples.next();
        }
        // The trimmed array must replace the buffer: the loops below run
        // over t.length, and unused buffer slots hold null.
        t = ArrayLib.trim(t, i);

        // nothing to pick a median from
        if ( t.length == 0 ) {
            return null;
        }

        // v holds the field values, idx the position of each value's tuple
        Object[] v = new Object[t.length];
        int[] idx = new int[t.length];
        for ( i=0; i<t.length; ++i ) {
            idx[i] = i;
            v[i] = ((Tuple)t[i]).get(field);
        }

        // NOTE(review): ArrayLib.sort is presumed to sort v by cmp while
        // applying the same permutation to idx -- confirm against ArrayLib
        ArrayLib.sort(v, idx, cmp);
        return (Tuple)t[idx[idx.length/2]];
    }

    /**
     * Get the Tuple with the median data field value. If the tuple set is
     * a {@link Table}, the row is looked up through the column's
     * {@link ColumnMetadata} and <code>cmp</code> is not consulted;
     * otherwise the tuples are collected and sorted using <code>cmp</code>.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @param cmp a comparator for sorting the field values; only used when
     * <code>tuples</code> is not a Table
     * @return the Tuple with the median data field value
     */
    public static Tuple median(TupleSet tuples, String field, Comparator cmp) {
        if ( tuples instanceof Table ) {
            Table table = (Table)tuples;
            ColumnMetadata md = table.getMetadata(field);
            return table.getTuple(md.getMedianRow());
        } else {
            return median(tuples.tuples(), field, cmp);
        }
    }

    /**
     * Get the Tuple with the median data field value, using the default
     * literal comparator where a comparator is needed.
     * @param tuples a TupleSet
     * @param field the column / data field name
     * @return the Tuple with the median data field value
     * @see #median(TupleSet, String, Comparator)
     */
    public static Tuple median(TupleSet tuples, String field) {
        return median(tuples, field, DefaultLiteralComparator.getInstance());
    }

    // ------------------------------------------------------------------------

    /**
     * Get the mean value of a tuple data value. If reading the field as a
     * double fails for any tuple (any exception is treated this way), or
     * if the iterator is empty, NaN will be returned.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return the mean value, or NaN if the values could not be read as
     * doubles or there were no tuples
     */
    public static double mean(Iterator tuples, String field) {
        try {
            int count = 0;
            double sum = 0;

            while ( tuples.hasNext() ) {
                sum += ((Tuple)tuples.next()).getDouble(field);
                ++count;
            }
            // for an empty iteration this is 0.0/0, i.e. NaN
            return sum/count;
        } catch ( Exception e ) {
            return Double.NaN;
        }
    }

    /**
     * Get the (population) standard deviation of a tuple data value. If
     * reading the field as a double fails for any tuple (any exception is
     * treated this way), or if the iterator is empty, NaN will be returned.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return the standard deviation value, or NaN if the values could not
     * be read as doubles or there were no tuples
     */
    public static double deviation(Iterator tuples, String field) {
        // The mean cannot be computed in a separate pass: an Iterator can
        // be traversed only once, so a second pass would see no tuples.
        // Instead the running mean and the sum of squared deviations are
        // updated together for each value (Welford's method).
        try {
            int count = 0;
            double mean = 0;
            double sumsq = 0;

            while ( tuples.hasNext() ) {
                double x = ((Tuple)tuples.next()).getDouble(field);
                ++count;
                double delta = x - mean;
                mean += delta/count;
                sumsq += delta*(x - mean);
            }
            // for an empty iteration this is sqrt(0.0/0), i.e. NaN
            return Math.sqrt(sumsq/count);
        } catch ( Exception e ) {
            return Double.NaN;
        }
    }

    /**
     * Get the (population) standard deviation of a tuple data value, given
     * a precomputed mean. If reading the field as a double fails for any
     * tuple (any exception is treated this way), or if the iterator is
     * empty, NaN will be returned.
     * @param tuples an iterator over tuples; it is fully consumed, so it
     * must not be the same iterator instance that was used to compute
     * <code>mean</code>
     * @param field the column / data field name
     * @param mean the mean of the column, used to speed up accurate
     * deviation calculation
     * @return the standard deviation value, or NaN if the values could not
     * be read as doubles or there were no tuples
     */
    public static double deviation(Iterator tuples, String field, double mean) {
        try {
            int count = 0;
            double sumsq = 0;
            double x;

            while ( tuples.hasNext() ) {
                x = ((Tuple)tuples.next()).getDouble(field) - mean;
                sumsq += x*x;
                ++count;
            }
            // for an empty iteration this is sqrt(0.0/0), i.e. NaN
            return Math.sqrt(sumsq/count);
        } catch ( Exception e ) {
            return Double.NaN;
        }
    }

    /**
     * Get the sum of a tuple data value. If reading the field as a double
     * fails for any tuple (any exception is treated this way), NaN will be
     * returned. An empty iteration sums to zero.
     * @param tuples an iterator over tuples; it is fully consumed
     * @param field the column / data field name
     * @return the sum, or NaN if the values could not be read as doubles
     */
    public static double sum(Iterator tuples, String field) {
        try {
            double sum = 0;

            while ( tuples.hasNext() ) {
                sum += ((Tuple)tuples.next()).getDouble(field);
            }
            return sum;
        } catch ( Exception e ) {
            return Double.NaN;
        }
    }

    // ------------------------------------------------------------------------

    /**
     * Infer the data field type across all tuples in a TupleSet. For a
     * {@link Table} the column type is returned directly. Otherwise the
     * type reported by each tuple is examined and the most general of the
     * encountered types is returned, provided every other encountered type
     * is assignable to it.
     * @param tuples the TupleSet to analyze
     * @param field the data field to type check
     * @return the inferred data type; null if <code>tuples</code> is not a
     * Table and contains no tuples
     * @throws IllegalArgumentException if incompatible types are used
     */
    public static Class inferType(TupleSet tuples, String field) {
        if ( tuples instanceof Table ) {
            return ((Table)tuples).getColumnType(field);
        } else {
            Class type = null;
            Iterator iter = tuples.tuples();
            while ( iter.hasNext() ) {
                Tuple t = (Tuple)iter.next();
                Class current = t.getColumnType(field);
                if ( type == null ) {
                    // first type seen so far
                    type = current;
                } else if ( !type.equals(current) ) {
                    if ( current.isAssignableFrom(type) ) {
                        // current is a supertype: widen the inferred type
                        type = current;
                    } else if ( !type.isAssignableFrom(current) ) {
                        // neither type can stand in for the other
                        throw new IllegalArgumentException(
                            "The data field ["+field+"] does not have " +
                            "a consistent type across provided Tuples");
                    }
                }
            }
            return type;
        }
    }

} // end of class DataLib