package edu.brown.markov;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import org.apache.commons.collections15.map.ListOrderedMap;
import org.apache.commons.collections15.set.ListOrderedSet;
import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONStringer;
import org.voltdb.VoltType;
import org.voltdb.catalog.Database;
import org.voltdb.utils.VoltTypeUtil;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import edu.brown.logging.LoggerUtil;
import edu.brown.logging.LoggerUtil.LoggerBoolean;
import edu.brown.markov.features.AbstractFeature;
import edu.brown.markov.features.BasePartitionFeature;
import edu.brown.markov.features.FeatureUtil;
import edu.brown.markov.features.TransactionIdFeature;
import edu.brown.statistics.HistogramUtil;
import edu.brown.statistics.ObjectHistogram;
import edu.brown.utils.ClassUtil;
import edu.brown.utils.CollectionUtil;
import edu.brown.utils.JSONSerializable;
import edu.brown.utils.JSONUtil;
import edu.brown.workload.TransactionTrace;
public class FeatureSet implements JSONSerializable {
// Log4j logger shared by every FeatureSet instance
private static final Logger LOG = Logger.getLogger(FeatureSet.class);
// Flags read as debug.val / trace.val to guard log statements before their messages are built
private final static LoggerBoolean debug = new LoggerBoolean();
private final static LoggerBoolean trace = new LoggerBoolean();
static {
// Hand LOG and both flags to LoggerUtil.
// NOTE(review): presumably this keeps the flags in sync with LOG's configured level -- confirm in LoggerUtil
LoggerUtil.attachObserver(LOG, debug, trace);
}
/**
* The members of this class that make up its JSON form.
* toJSON() hands all of them to JSONUtil. fromJSON() lets JSONUtil restore
* everything except TXN_VALUES, which it reads from the JSON object under the
* key TXN_VALUES.name() and rebuilds by hand.
*/
public enum Members {
TXN_VALUES,
ATTRIBUTES,
LAST_NUM_ATTRIBUTES,
ATTRIBUTE_HISTOGRAMS,
ATTRIBUTE_TYPES,
}
/**
* The kind of value stored under an attribute.
* When addFeature() is not given a Type it infers one from the first value,
* and export() uses the Type to decide which sort of Weka attribute to create.
*/
public enum Type {
// Inferred for values whose class has Number among its superclasses; exported as a numeric attribute
NUMERIC,
// Inferred for String values; exported as a Weka string attribute
STRING,
// Inferred for any other value; exported as a nominal attribute over the values seen in the histogram
RANGE,
// Inferred for Boolean values; exported as a nominal attribute over the values seen in the histogram
BOOLEAN,
}
/**
* The row values for each txn record, keyed by transaction id.
* Rows are positional: slot i of a row holds the value for the attribute
* at index i of the attributes map. A null slot means no value was recorded.
*/
public final Map<Long, Vector<Object>> txn_values = new ListOrderedMap<Long, Vector<Object>>();
/**
* The list of attributes that each txn should have, mapped to the Type of
* each attribute. An attribute's index in this map is its column in every row.
*/
public final ListOrderedMap<String, Type> attributes = new ListOrderedMap<String, Type>();
// The attribute count as of the last time addFeature() resized the rows;
// addFeature() compares against it to detect that the rows must grow again
public int last_num_attributes = 0;
/**
* Attribute Value Histograms.
* Every value handed to addFeature() is also recorded in the histogram for
* its attribute key; export() reads these when normalizing and when listing
* the nominal values of RANGE and BOOLEAN attributes.
*/
public final Map<String, ObjectHistogram> attribute_histograms = new HashMap<String, ObjectHistogram>();
/**
* The VoltType of each attribute's values.
* An attribute starts out as VoltType.NULL and is switched to the type of
* the first non-null value that addFeature() stores for it.
*/
public final Map<String, VoltType> attribute_types = new HashMap<String, VoltType>();
/**
 * Creates an empty FeatureSet: no attributes and no transaction rows.
 */
public FeatureSet() {
    // All state is set up by the field initializers
}
/**
 * Returns the number of attributes registered in this FeatureSet.
 * @return the size of the attributes map
 */
public int getAttributeCount() {
    final int num_attributes = this.attributes.size();
    return num_attributes;
}
/**
 * Returns the number of transactions that have a row of feature values.
 * @return the size of the txn_values map
 */
public int getTransactionCount() {
    final int num_txns = this.txn_values.size();
    return num_txns;
}
/**
 * Returns the attribute keys as a list, as provided by the ordered attributes map.
 * @return the list of attribute keys
 */
public List<String> getFeatures() {
    final List<String> keys = this.attributes.asList();
    return keys;
}
/**
 * Looks up the Type registered for an attribute.
 * @param key the attribute key
 * @return whatever the attributes map holds for the key (null when the map has no entry)
 */
public Type getFeatureType(String key) {
    final Type type = this.attributes.get(key);
    return type;
}
/**
 * Records a feature value without naming its Type.
 * Delegates to the four-argument overload with a null Type, which makes that
 * overload infer the Type from the value when the attribute is new.
 * @param txn the transaction the value belongs to
 * @param key the attribute key
 * @param val the value to store
 */
public void addFeature(TransactionTrace txn, String key, Object val) {
    final Type unspecified = null;
    this.addFeature(txn, key, val, unspecified);
}
/**
 * Tells whether an attribute with the given key has been registered.
 * @param key the attribute key
 * @return true if the attributes map contains the key
 */
public boolean hasFeature(String key) {
    final boolean registered = this.attributes.containsKey(key);
    return registered;
}
/**
 * Returns the position of an attribute within the list of attributes.
 * @param key the attribute key
 * @return the index of the key, or null if the key is not an attribute
 */
public Integer getFeatureIndex(String key) {
    final int position = this.attributes.indexOf(key);
    if (position == -1) {
        return null;
    }
    return Integer.valueOf(position);
}
/**
 * Collects the indexes of every attribute produced by the given feature class.
 * An attribute matches when its key starts with the key prefix that
 * FeatureUtil reports for the class.
 * @param feature_class the feature implementation to look for
 * @return the (possibly empty) set of matching attribute indexes
 */
public Set<Integer> getFeatureIndexes(Class<? extends AbstractFeature> feature_class) {
    final Set<Integer> matches = new HashSet<Integer>();
    final String prefix = FeatureUtil.getFeatureKeyPrefix(feature_class);
    int position = 0;
    for (String key : this.attributes.asList()) {
        if (key.startsWith(prefix)) {
            matches.add(position);
        }
        position++;
    } // FOR
    return matches;
}
/**
 * Returns the row of feature values stored for a transaction id.
 * @param txn_id the transaction id
 * @return the row held in txn_values for the id (null when there is none)
 */
public List<Object> getFeatureValues(Long txn_id) {
    final Vector<Object> row = this.txn_values.get(txn_id);
    return row;
}
/**
 * Returns the row of feature values stored for a transaction.
 * @param txn_trace the transaction whose id is used for the lookup
 * @return the same row that the lookup by transaction id returns
 */
public List<Object> getFeatureValues(TransactionTrace txn_trace) {
    final Long txn_id = txn_trace.getTransactionId();
    return this.getFeatureValues(txn_id);
}
/**
 * Returns a single feature value for a transaction id.
 * The value is cast to whatever type the caller expects; the cast is
 * unchecked, so asking for the wrong type fails at the call site.
 * @param txn_id the transaction id
 * @param key the attribute key
 * @return the stored value, or null if the transaction has no row, the key is
 *         not an attribute, the row has no slot for it, or the slot is empty
 */
@SuppressWarnings("unchecked")
public <T> T getFeatureValue(Long txn_id, String key) {
    final int idx = this.attributes.indexOf(key);
    final Vector<Object> row = this.txn_values.get(txn_id);
    // Unknown attribute (indexOf gives -1), unknown transaction, or a row that
    // is too short: report "no value" the same way the sibling lookups do,
    // instead of failing inside Vector.get() or on a null row
    if (idx == -1 || row == null || idx >= row.size()) {
        return null;
    }
    return (T) row.get(idx);
}
/**
 * Returns a single feature value for a transaction.
 * @param txn_trace the transaction whose id is used for the lookup
 * @param key the attribute key
 * @return the result of the lookup by transaction id, cast to the caller's type
 */
@SuppressWarnings("unchecked")
public <T> T getFeatureValue(TransactionTrace txn_trace, String key) {
    final Long txn_id = txn_trace.getTransactionId();
    return (T) this.getFeatureValue(txn_id, key);
}
/**
 * Records the value of one attribute for one transaction.
 * <p>
 * If the key is not yet an attribute it is registered first, with an empty
 * histogram and a VoltType of NULL. When no Type is given for a new
 * attribute, the Type is inferred from the value: Boolean gives BOOLEAN, a
 * class with Number among its superclasses gives NUMERIC, String gives
 * STRING, and anything else gives RANGE. Integer values are widened to Long
 * before they are stored. The value is added to the attribute's histogram,
 * every existing row is grown if the attribute count changed, and the value
 * is written into the transaction's row.
 *
 * @param txn the transaction the value belongs to
 * @param key the attribute key
 * @param val the value to store (may be null once the attribute's Type is known)
 * @param type the attribute Type, or null to infer it from the value
 * @throws IllegalArgumentException if the attribute is new and both the type
 *         and the value are null, so there is nothing to infer the Type from
 * @throws RuntimeException if the attribute's histogram rejects the value
 */
public synchronized void addFeature(TransactionTrace txn, String key, Object val, Type type) {
    long txn_id = txn.getTransactionId();

    // Add the attribute if it's new
    if (!this.attributes.containsKey(key)) {
        // Figure out what type it is
        if (type == null) {
            // Fail before touching any state, with a message that names the attribute
            if (val == null) {
                throw new IllegalArgumentException("Unable to infer the type of new attribute '" + key +
                                                   "' from a null value");
            }
            if (val instanceof Boolean) {
                type = Type.BOOLEAN;
            } else if (ClassUtil.getSuperClasses(val.getClass()).contains(Number.class)) {
                type = Type.NUMERIC;
            } else if (val instanceof String) {
                type = Type.STRING;
            } else {
                type = Type.RANGE;
            }
        }
        if (debug.val) LOG.debug("Adding new attribute " + key + " [" + type + "]");
        this.attributes.put(key, type);
        this.attribute_histograms.put(key, new ObjectHistogram());
        this.attribute_types.put(key, VoltType.NULL);
    }

    // HACK: store Integers as Longs
    if (val instanceof Integer) {
        val = Long.valueOf(((Integer) val).longValue());
    }

    // Always store the values in a histogram so we can normalize them later on
    try {
        this.attribute_histograms.get(key).put(val);
    } catch (Exception ex) {
        LOG.error("\n" + this.attribute_histograms.get(key));
        String msg = "Invalid value '" + val + "' for attribute '" + key + "'";
        LOG.error(msg, ex);
        // Let the caller decide what to do rather than killing the whole JVM
        throw new RuntimeException(msg, ex);
    }

    // Now add the values into this txn's feature vector
    int idx = this.attributes.indexOf(key);
    int num_attributes = this.attributes.size();
    Vector<Object> values = this.txn_values.get(txn_id);
    if (values == null) {
        if (trace.val) LOG.trace("Creating new feature vector for " + txn_id);
        values = new Vector<Object>(num_attributes);
        values.setSize(num_attributes);
        this.txn_values.put(txn_id, values);
    }
    // A new attribute means every row needs one more (null) slot
    if (num_attributes != this.last_num_attributes) {
        assert(num_attributes > this.last_num_attributes);
        for (Vector<Object> v : this.txn_values.values()) {
            v.setSize(num_attributes);
        } // FOR
        this.last_num_attributes = num_attributes;
        if (trace.val) LOG.trace("Increased FeatureSet size to " + this.last_num_attributes + " attributes");
    }
    values.set(idx, val);

    // The first non-null value decides the attribute's VoltType
    if (val != null && this.attribute_types.get(key) == VoltType.NULL) {
        this.attribute_types.put(key, VoltType.typeFromClass(val.getClass()));
    }
    if (trace.val) LOG.trace(txn_id + ": " + key + " => " + val);
}
/**
 * Exports every attribute of this FeatureSet to a Weka Instances data set,
 * without normalizing the values.
 * @param name the name given to the data set
 * @return the exported data set
 */
public Instances export(String name) {
    final Collection<String> all_keys = this.attributes.keySet();
    return this.export(name, false, all_keys);
}
/**
 * Exports every attribute of this FeatureSet to a Weka Instances data set.
 * @param name the name given to the data set
 * @param normalize whether values should be replaced by their normalized form
 * @return the exported data set
 */
public Instances export(String name, boolean normalize) {
    final Collection<String> all_keys = this.attributes.keySet();
    return this.export(name, normalize, all_keys);
}
/**
 * Export this FeatureSet to a Weka Instances data set.
 * <p>
 * An attribute is exported when its key starts with at least one of the
 * given prefixes. Each transaction row becomes one Instance of weight 1.0;
 * null values become Weka missing values.
 * <p>
 * With normalize set, every exported attribute except those in the ignore
 * set becomes a numeric attribute whose values are looked up in the map that
 * HistogramUtil.normalize() builds from the attribute's histogram. Otherwise
 * RANGE and BOOLEAN attributes become nominal attributes over the values in
 * the histogram, STRING attributes become string attributes, and everything
 * else is numeric.
 *
 * @param name the name given to the data set
 * @param normalize whether values should be replaced by their normalized form
 * @param prefix_include key prefixes selecting the attributes to export
 * @return the exported data set
 * @throws RuntimeException if a normalized value cannot be found for a stored value
 */
@SuppressWarnings("unchecked")
public Instances export(String name, boolean normalize, Collection<String> prefix_include) {
    // Figure out what attributes we want to export
    Set<String> export_attrs = new ListOrderedSet<String>();
    for (String key : this.attributes.keySet()) {
        for (String prefix : prefix_include) {
            if (key.startsWith(prefix)) {
                export_attrs.add(key);
                break;
            }
        } // FOR
    } // FOR
    final int num_export = export_attrs.size();
    if (debug.val) LOG.debug("# of Attributes to Export: " + num_export);

    List<Map<Object, Double>> normalized_values = null;
    Set<String> normalize_ignore = new HashSet<String>();
    if (normalize) {
        // NOTE(review): these are key prefixes but they are matched against whole
        // keys below -- confirm that these two features use the prefix as their full key
        normalize_ignore.add(FeatureUtil.getFeatureKeyPrefix(TransactionIdFeature.class));
        normalize_ignore.add(FeatureUtil.getFeatureKeyPrefix(BasePartitionFeature.class));
        if (debug.val) LOG.debug("Normalizing values!");
        normalized_values = new ArrayList<Map<Object, Double>>(num_export);
        for (String key : export_attrs) {
            if (normalize_ignore.contains(key)) {
                normalized_values.add(null);
            } else {
                normalized_values.add(HistogramUtil.normalize(this.attribute_histograms.get(key)));
            }
        } // FOR
    }

    // Attributes
    // Everything that depends only on the attribute (and not on the txn row)
    // is resolved here once, so the per-row loop below is plain array access
    final String keys[] = new String[num_export];
    final int attr_idxs[] = new int[num_export];
    final Type types[] = new Type[num_export];
    final boolean normalize_attrs[] = new boolean[num_export];
    FastVector attrs = new FastVector();
    int col = 0;
    for (String key : export_attrs) {
        Type type = this.attributes.get(key);
        boolean normalize_attr = (normalize && !normalize_ignore.contains(key));
        keys[col] = key;
        attr_idxs[col] = this.attributes.indexOf(key);
        types[col] = type;
        normalize_attrs[col] = normalize_attr;
        col++;

        Attribute a = null;
        // Normalized values will always just be numeric
        if (normalize_attr) {
            a = new Attribute(key);
        // Otherwise we can play games with ranges and strings
        } else {
            switch (type) {
                case RANGE:
                case BOOLEAN: {
                    FastVector range_values = new FastVector();
                    for (Object v : this.attribute_histograms.get(key).values()) {
                        range_values.addElement(v.toString());
                    } // FOR
                    a = new Attribute(key, range_values);
                    break;
                }
                case STRING:
                    a = new Attribute(key, (FastVector)null);
                    break;
                default:
                    a = new Attribute(key);
            } // SWITCH
        }
        attrs.addElement(a);
    } // FOR
    assert(attrs.size() == num_export);

    Instances data = new Instances(name, attrs, 0);

    // Instance Values
    for (Vector<Object> values : this.txn_values.values()) {
        double instance[] = new double[data.numAttributes()];
        for (int i = 0; i < num_export; i++) {
            Object value = values.get(attr_idxs[i]);

            // Null => Missing Value Placeholder
            if (value == null) {
                instance[i] = Instance.missingValue();
            // Normalized
            } else if (normalize_attrs[i]) {
                assert(normalized_values != null);
                try {
                    // Unboxing a missing (null) entry fails here and is reported below
                    instance[i] = normalized_values.get(i).get(value);
                } catch (Exception ex) {
                    LOG.fatal("Failed to get normalized value '" + value + "' for Attribute '" + keys[i] + "'", ex);
                    LOG.fatal("Normalized values for Attribute '" + keys[i] + "': " + normalized_values.get(i));
                    throw new RuntimeException(ex);
                }
            // Actual Values
            } else {
                switch (types[i]) {
                    case NUMERIC:
                        instance[i] = ((Number)value).doubleValue();
                        break;
                    case STRING:
                        instance[i] = data.attribute(i).addStringValue(value.toString());
                        break;
                    case BOOLEAN:
                        instance[i] = data.attribute(i).indexOfValue(Boolean.toString((Boolean)value));
                        break;
                    case RANGE:
                        instance[i] = data.attribute(i).indexOfValue(value.toString());
                        break;
                    default:
                        assert(false) : "Unexpected attribute type " + types[i];
                } // SWITCH
            }
        } // FOR
        data.add(new Instance(1.0, instance));
    } // FOR
    return (data);
}
// -----------------------------------------------------------------
// SERIALIZATION
// -----------------------------------------------------------------
/**
 * Populates this FeatureSet from a file by delegating to JSONUtil.load().
 * @param input_path the file to read
 * @param catalog_db the catalog database handed through to JSONUtil
 * @throws IOException if JSONUtil fails to read the file
 */
@Override
public void load(File input_path, Database catalog_db) throws IOException {
    final FeatureSet target = this;
    JSONUtil.load(target, catalog_db, input_path);
}
/**
 * Writes this FeatureSet to a file by delegating to JSONUtil.save().
 * @param output_path the file to write
 * @throws IOException if JSONUtil fails to write the file
 */
@Override
public void save(File output_path) throws IOException {
    final FeatureSet source = this;
    JSONUtil.save(source, output_path);
}
/**
 * Returns the JSON text for this FeatureSet as produced by JSONUtil.toJSONString().
 * @return the JSON string
 */
@Override
public String toJSONString() {
    final String json = JSONUtil.toJSONString(this);
    return json;
}
/**
 * Writes every member listed in the Members enum to the stringer via JSONUtil.
 * @param stringer the JSON writer to append to
 * @throws JSONException if JSONUtil fails to write a field
 */
@Override
public void toJSON(JSONStringer stringer) throws JSONException {
    final Members[] all_members = FeatureSet.Members.values();
    JSONUtil.fieldsToJSON(stringer, this, FeatureSet.class, all_members);
}
/**
 * Restores this FeatureSet from a JSON object.
 * <p>
 * JSONUtil restores every member except TXN_VALUES. The rows are then rebuilt
 * here, because the JSON does not carry the type of each value: every
 * non-null array element is read as a string and converted according to the
 * Type of the attribute at the same position (NUMERIC uses the attribute's
 * VoltType, RANGE uses the type estimated by the attribute's histogram).
 *
 * @param json_object the JSON object to read from
 * @param catalog_db the catalog database handed through to JSONUtil
 * @throws JSONException if a field is missing or a row value cannot be converted
 */
@Override
public void fromJSON(JSONObject json_object, Database catalog_db) throws JSONException {
    // Everything but TXN_VALUES can be restored generically
    Set<Members> generic_fields = CollectionUtil.getAllExcluding(FeatureSet.Members.values(), FeatureSet.Members.TXN_VALUES);
    JSONUtil.fieldsFromJSON(json_object, catalog_db, this, FeatureSet.class, generic_fields.toArray(new Members[0]));

    // The rows have to be reconstructed by hand because the value types are implicit
    JSONObject json_rows = json_object.getJSONObject(Members.TXN_VALUES.name());
    assert(json_rows != null);
    for (Iterator<String> row_keys = json_rows.keys(); row_keys.hasNext(); ) {
        String row_key = row_keys.next();
        long txn_id = Long.valueOf(row_key);

        // Register the row before filling it in
        Vector<Object> row = new Vector<Object>();
        this.txn_values.put(txn_id, row);

        JSONArray json_row = json_rows.getJSONArray(row_key);
        final int num_cols = json_row.length();
        for (int col = 0; col < num_cols; col++) {
            Object val = null;
            if (!json_row.isNull(col)) {
                String json_val = json_row.getString(col);
                String attr_key = this.attributes.get(col);
                Type attr_type = this.attributes.getValue(col);
                try {
                    switch (attr_type) {
                        case NUMERIC: {
                            VoltType volt_type = this.attribute_types.get(attr_key);
                            val = VoltTypeUtil.getObjectFromString(volt_type, json_val);
                            break;
                        }
                        case BOOLEAN:
                            val = Boolean.valueOf(json_val);
                            break;
                        case RANGE: {
                            // The histogram tells us what the values looked like
                            ObjectHistogram histogram = this.attribute_histograms.get(attr_key);
                            assert(histogram != null);
                            VoltType volt_type = histogram.getEstimatedType();
                            assert(volt_type != VoltType.INVALID);
                            val = VoltTypeUtil.getObjectFromString(volt_type, json_val);
                            break;
                        }
                        case STRING:
                            val = json_val;
                            break;
                        default:
                            assert(false) : "Unexpected Type: " + attr_type;
                    } // SWITCH
                } catch (Exception ex) {
                    LOG.fatal("Failed to deserialize TXN_VALUES-" + row_key + "-" + col);
                    throw new JSONException(ex);
                }
            }
            row.add(val);
        } // FOR
    } // FOR
}
}