package edu.brown.markov;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.collections15.map.ListOrderedMap;
import org.apache.commons.collections15.set.ListOrderedSet;
import org.apache.commons.pool.BasePoolableObjectFactory;
import org.apache.log4j.Logger;
import org.voltdb.CatalogContext;
import org.voltdb.catalog.Procedure;
import weka.classifiers.Classifier;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.J48;
import weka.clusterers.AbstractClusterer;
import weka.clusterers.EM;
import weka.clusterers.FilteredClusterer;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.NumericToNominal;
import edu.brown.catalog.CatalogUtil;
import edu.brown.costmodel.MarkovCostModel;
import edu.brown.hstore.estimators.markov.MarkovEstimator;
import edu.brown.logging.LoggerUtil;
import edu.brown.logging.LoggerUtil.LoggerBoolean;
import edu.brown.markov.containers.MarkovGraphsContainerUtil;
import edu.brown.markov.containers.MarkovGraphsContainer;
import edu.brown.markov.features.BasePartitionFeature;
import edu.brown.markov.features.FeatureUtil;
import edu.brown.pools.FastObjectPool;
import edu.brown.pools.Poolable;
import edu.brown.statistics.ObjectHistogram;
import edu.brown.utils.ArgumentsParser;
import edu.brown.utils.CollectionUtil;
import edu.brown.utils.FileUtil;
import edu.brown.utils.PartitionEstimator;
import edu.brown.utils.PartitionSet;
import edu.brown.utils.StringUtil;
import edu.brown.utils.UniqueCombinationIterator;
import edu.brown.workload.TransactionTrace;
import edu.brown.workload.Workload;
/**
*
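 * Feed-forward feature selection for clustering a procedure's transactions into
 * per-cluster MarkovGraphs. The search looks for the MarkovAttributeSet whose clustering
 * produces the lowest MarkovCostModel prediction cost on the validation split.
 * <p>
 * A minimal usage sketch mirroring the workflow in {@link #main} (assuming {@code data}
 * is the Weka Instances export of this procedure's FeatureSet, and the caller lives in
 * the same package since these methods are protected):
 * <pre>{@code
 * FeatureClusterer fclusterer = new FeatureClusterer(catalogContext, catalog_proc,
 *                                                    workload, partitions, num_threads);
 * MarkovAttributeSet best = fclusterer.calculate(data);
 * System.out.println(best + " -> cost " + best.getCost());
 * fclusterer.cleanup();
 * }</pre>
 *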
* @author pavlo
*/
public class FeatureClusterer {
private static final Logger LOG = Logger.getLogger(FeatureClusterer.class);
private static final LoggerBoolean debug = new LoggerBoolean();
private static final LoggerBoolean trace = new LoggerBoolean();
static {
LoggerUtil.attachObserver(LOG, debug, trace);
}
public enum SplitType {
/** Training Workload Percentage */
TRAINING (0.40),
/** Validation Workload Percentage */
VALIDATION (0.60),
/** Testing Workload Percentage */
TESTING (0.00);
private final double percentage;
private SplitType(double percentage) {
this.percentage = percentage;
}
public double getPercentage() {
return this.percentage;
}
}
// ----------------------------------------------------------------------------
// DEFAULT CONFIGURATION VALUES
// ----------------------------------------------------------------------------
    /** For each search round, we will only propagate the attributes found in the top-k AttributeSets */
private static final double DEFAULT_ATTRIBUTESET_TOP_K = 0.10;
/** Number of threads to use per thread pool */
private static final int DEFAULT_NUM_THREADS = 2;
/** Number of search rounds in findBestMarkovAttributeSet */
private static final int DEFAULT_NUM_ROUNDS = 10;
// ----------------------------------------------------------------------------
// MARKOVGRAPHSCONTAINER WRAPPER
// ----------------------------------------------------------------------------
private static class TxnToClusterMarkovGraphsContainer extends MarkovGraphsContainer {
/**
* Hackish cross-reference table to go from the TransactionId to Cluster#
*/
private final Map<Long, Integer> txnid_cluster_xref = new HashMap<Long, Integer>();
@Override
public MarkovGraph getFromParams(Long txn_id, int base_partition, Object[] params, Procedure catalog_proc) {
// Look-up what cluster our TransactionTrace belongs to
Integer cluster = this.txnid_cluster_xref.get(txn_id);
assert(cluster != null) : "Failed to initialize TransactionId->Cluster# xref for txn #" + txn_id;
return this.get(cluster, catalog_proc);
}
/**
* Map a TransactionId to a ClusterId
* @param txn_id
* @param cluster_id
*/
public void addTransactionClusterXref(long txn_id, int cluster_id) {
this.txnid_cluster_xref.put(txn_id, cluster_id);
}
@Override
public void clear() {
super.clear();
this.txnid_cluster_xref.clear();
}
}
// ----------------------------------------------------------------------------
// EXECUTION STATE
// ----------------------------------------------------------------------------
/**
* ExecutionState Factory
*/
private static class ExecutionStateFactory extends BasePoolableObjectFactory {
private final FeatureClusterer fclusterer;
public ExecutionStateFactory(FeatureClusterer fclusterer) {
this.fclusterer = fclusterer;
}
@Override
public Object makeObject() throws Exception {
return this.fclusterer.new ExecutionState();
}
@Override
public void passivateObject(Object obj) throws Exception {
ExecutionState state = (ExecutionState)obj;
state.finish();
}
} // END CLASS
    /**
     * Per-run execution state: the clusterer under evaluation plus the per-partition
     * MarkovGraphs, cost models, and TransactionEstimators used to score it.
     */
private class ExecutionState implements Poolable {
/**
* Current Clusterer for this ExecutionState
*/
AbstractClusterer clusterer;
/**
* Set of all the ClusterIds that we have seen
*/
final Set<Integer> cluster_ids = new HashSet<Integer>();
/**
* We want to always split the MarkovGraphContainers by base partition, since we already know
* that this is going to be the best predictor
*/
final TxnToClusterMarkovGraphsContainer markovs_per_partition[];
/**
* Then we have a costmodel for each PartitionId
*/
final MarkovCostModel costmodels_per_partition[];
/**
* And a TransactionEstimator for each PartitionId
*/
final MarkovEstimator t_estimators_per_partition[];
/**
* Histogram of Clusters Per Partition
*/
final ObjectHistogram<Integer> clusters_per_partition[];
int c_counters[] = new int[] {
0, // Single-P
0, // Multi-P
            0, // Unknown Clusters
};
int t_counters[] = new int[] {
0, // Single-P
0, // Multi-P
0, // Total # of Txns
};
/**
* Constructor
*/
@SuppressWarnings("unchecked")
private ExecutionState() {
// We allocate a complete array for all of the partitions in the catalog
this.markovs_per_partition = new TxnToClusterMarkovGraphsContainer[FeatureClusterer.this.total_num_partitions];
this.costmodels_per_partition = new MarkovCostModel[FeatureClusterer.this.total_num_partitions];
this.t_estimators_per_partition = new MarkovEstimator[FeatureClusterer.this.total_num_partitions];
this.clusters_per_partition = (ObjectHistogram<Integer>[])new ObjectHistogram<?>[FeatureClusterer.this.total_num_partitions];
// But then only initialize the partition-specific data structures
for (int p : FeatureClusterer.this.all_partitions) {
this.clusters_per_partition[p] = new ObjectHistogram<Integer>();
this.markovs_per_partition[p] = new TxnToClusterMarkovGraphsContainer();
this.t_estimators_per_partition[p] = new MarkovEstimator(catalogContext, p_estimator, this.markovs_per_partition[p]);
this.costmodels_per_partition[p] = new MarkovCostModel(catalogContext, p_estimator, this.t_estimators_per_partition[p], thresholds);
} // FOR
}
public void init(AbstractClusterer clusterer) {
this.clusterer = clusterer;
}
@Override
public boolean isInitialized() {
return (this.clusterer != null);
}
public void finish() {
this.clusterer = null;
this.cluster_ids.clear();
for (int p : FeatureClusterer.this.all_partitions) {
this.clusters_per_partition[p].clear();
this.markovs_per_partition[p].clear();
                // It's not pretty, but we need to recreate the cost model here so that it starts with a clean slate for the next run
this.costmodels_per_partition[p] = new MarkovCostModel(catalogContext, p_estimator, this.t_estimators_per_partition[p], thresholds);
} // FOR
// Reset Counters
for (int i = 0; i < this.c_counters.length; i++) {
this.c_counters[i] = 0;
this.t_counters[i] = 0;
} // FOR
}
}
// ----------------------------------------------------------------------------
// DATA MEMBERS
// ----------------------------------------------------------------------------
/**
     * We also maintain a "global" MarkovGraphsContainer that consumes all transactions.
     * We will use it to compare whether our cluster-specific models do better than the global one.
* This is automatically connected to the FeatureClusterer's base partition cache, so we
* don't have to do anything special to get out what we need here
*/
private final MarkovGraphsContainer global_markov = new MarkovGraphsContainer() {
        @Override
        public MarkovGraph getFromParams(Long txn_id, int base_partition, Object[] params, Procedure catalog_proc) {
            return (this.get(base_partition, catalog_proc));
        }
};
/**
* Global Cost Model
*/
private final MarkovCostModel global_costmodel;
/**
* Global TransactionEstimator
*/
private final MarkovEstimator global_t_estimator;
/**
* Global Counters
*/
private double total_g_cost = 0.0d;
private int g_counters[] = new int[] {
0, // Single-P
0, // Multi-P
        0, // Unknown Clusters
};
private final Instances splits[] = new Instances[SplitType.values().length];
private final double split_percentages[] = new double[SplitType.values().length];
private final int split_counts[] = new int[SplitType.values().length];
private final CatalogContext catalogContext;
private final Procedure catalog_proc;
private final Workload workload;
private final EstimationThresholds thresholds;
private final PartitionEstimator p_estimator;
private final Random rand = new Random(); // FIXME
private final PartitionSet all_partitions;
private final int total_num_partitions;
private final FastObjectPool<ExecutionState> state_pool = new FastObjectPool<ExecutionState>(new ExecutionStateFactory(this));
    /** We use a single thread pool for all of the cost-calculation threads */
private final ExecutorService calculate_threadPool;
private double round_topk = DEFAULT_ATTRIBUTESET_TOP_K;
private int num_rounds = DEFAULT_NUM_ROUNDS;
private final Map<Long, PartitionSet> cache_all_partitions = new HashMap<Long, PartitionSet>();
private final Map<Long, Integer> cache_base_partition = new HashMap<Long, Integer>();
/**
* Constructor
* @param catalogContext
* @param catalog_proc
* @param workload
* @param all_partitions
* @param num_threads
*/
public FeatureClusterer(CatalogContext catalogContext, Procedure catalog_proc, Workload workload, PartitionSet all_partitions, int num_threads) {
this.catalogContext = catalogContext;
this.catalog_proc = catalog_proc;
this.workload = workload;
this.thresholds = new EstimationThresholds(); // FIXME
this.p_estimator = new PartitionEstimator(catalogContext);
this.all_partitions = all_partitions;
this.total_num_partitions = catalogContext.numberOfPartitions;
this.calculate_threadPool = Executors.newFixedThreadPool(num_threads);
for (SplitType type : SplitType.values()) {
this.split_percentages[type.ordinal()] = type.percentage;
} // FOR
this.global_t_estimator = new MarkovEstimator(this.catalogContext, this.p_estimator, this.global_markov);
this.global_costmodel = new MarkovCostModel(catalogContext, this.p_estimator, this.global_t_estimator, this.thresholds);
        for (Integer p : this.all_partitions) {
            this.global_markov.getOrCreate(p, this.catalog_proc).initialize();
        } // FOR
}
/**
* Constructor
*/
public FeatureClusterer(CatalogContext catalogContext, Procedure catalog_proc, Workload workload, PartitionSet all_partitions) {
this(catalogContext, catalog_proc, workload, all_partitions, DEFAULT_NUM_THREADS);
}
protected final void cleanup() {
// this.generate_threadPool.shutdownNow();
this.calculate_threadPool.shutdownNow();
}
public void setNumRounds(int numRounds) {
this.num_rounds = numRounds;
if (debug.val) LOG.debug("Number of Rounds: " + numRounds);
}
public void setSplitPercentage(SplitType type, double percentage) {
this.split_percentages[type.ordinal()] = percentage;
if (debug.val) LOG.debug(String.format("%s Split Percentage: ", type.name(), percentage));
}
public void setAttributeTopK(double topk) {
this.round_topk = topk;
if (debug.val) LOG.debug("Attribute Top-K: " + topk);
}
protected MarkovCostModel getGlobalCostModel() {
return this.global_costmodel;
}
protected MarkovGraphsContainer getGlobalMarkovGraphs() {
return this.global_markov;
}
protected int[] getGlobalCounters() {
return (this.g_counters);
}
    /**
     * Split the input data set into training/validation/testing subsets according to the
     * configured SplitType percentages, converting each split's attributes to nominal values.
     * @param data
     * @return the splits indexed by SplitType ordinal
     */
protected Instances[] splitWorkload(Instances data) {
int offset = 0;
int all_cnt = data.numInstances();
for (SplitType stype : SplitType.values()) {
int idx = stype.ordinal();
this.split_counts[idx] = (int)Math.round(all_cnt * stype.percentage);
try {
this.splits[idx] = new Instances(data, offset, this.split_counts[idx]);
// Apply NumericToNominal filter!
NumericToNominal filter = new NumericToNominal();
filter.setInputFormat(this.splits[idx]);
this.splits[idx] = Filter.useFilter(this.splits[idx], filter);
} catch (Exception ex) {
throw new RuntimeException("Failed to split " + stype + " workload", ex);
}
offset += this.split_counts[idx];
if (debug.val) LOG.debug(String.format("%-12s%d", stype.toString()+":", this.split_counts[idx]));
} // FOR
return (this.splits);
}
// ----------------------------------------------------------------------------
// CACHING METHODS
// ----------------------------------------------------------------------------
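    /**
     * Return the base partition for the given TransactionTrace, caching the result
     * so that repeated look-ups for the same transaction are cheap.
     */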
private int getBasePartition(TransactionTrace txn_trace) {
Long txn_id = Long.valueOf(txn_trace.getTransactionId());
Integer base_partition = this.cache_base_partition.get(txn_id);
if (base_partition == null) {
try {
base_partition = this.p_estimator.getBasePartition(txn_trace);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
this.cache_base_partition.put(txn_id, base_partition);
}
return (base_partition.intValue());
}
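    /**
     * Return the full set of partitions touched by the given TransactionTrace,
     * caching the result per transaction id.
     */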
private PartitionSet getAllPartitions(TransactionTrace txn_trace) {
Long txn_id = Long.valueOf(txn_trace.getTransactionId());
PartitionSet all_partitions = this.cache_all_partitions.get(txn_id);
if (all_partitions == null) {
all_partitions = new PartitionSet();
try {
this.p_estimator.getAllPartitions(all_partitions, txn_trace);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
this.cache_all_partitions.put(txn_id, all_partitions);
}
return (all_partitions);
}
// ----------------------------------------------------------------------------
// CALCULATION METHODS
// ----------------------------------------------------------------------------
    /**
     * Perform feed-forward feature selection to find the MarkovAttributeSet whose clustering
     * gives the lowest prediction cost for this procedure.
     * @param data
     * @return the best MarkovAttributeSet found
     * @throws Exception
     */
@SuppressWarnings("unchecked")
protected MarkovAttributeSet calculate(final Instances data) throws Exception {
// ----------------------------------------------------------------------------
// Split the input data set into separate data sets
// ----------------------------------------------------------------------------
if (debug.val) LOG.debug(String.format("Splitting %d instances", data.numInstances()));
this.splitWorkload(data);
// ----------------------------------------------------------------------------
// Calculate global information
// ----------------------------------------------------------------------------
if (debug.val) LOG.debug("Calculating Global MarkovGraph cost");
this.calculateGlobalCost();
// ----------------------------------------------------------------------------
// Perform Feed-Forward Selection
// ----------------------------------------------------------------------------
Attribute base_partition_attr = data.attribute(FeatureUtil.getFeatureKeyPrefix(BasePartitionFeature.class));
assert(base_partition_attr != null);
Integer base_partition_idx = base_partition_attr.index();
assert(base_partition_idx != null);
// Get the list of all the attributes that we are going to want to try to cluster on
// We want to always remove the first attribute because that's the TransactionId
List<Attribute> temp = (List<Attribute>)CollectionUtil.addAll(new ArrayList<Attribute>(), data.enumerateAttributes());
// Remove the TransactionId and BasePartition features
temp.remove(FeatureExtractor.TXNID_ATTRIBUTE_IDX);
        temp.remove(base_partition_attr); // remove by object; removing the boxed index would call remove(Object) and do nothing
Collections.shuffle(temp, this.rand);
ListOrderedSet<Attribute> all_attributes = new ListOrderedSet<Attribute>();
all_attributes.addAll(temp);
// List of all AttributeSets ever created
final SortedSet<MarkovAttributeSet> all_asets = new TreeSet<MarkovAttributeSet>();
// The AttributeSets created in each round
final SortedSet<MarkovAttributeSet> round_asets = new TreeSet<MarkovAttributeSet>();
final Map<MarkovAttributeSet, AbstractClusterer> round_clusterers = new HashMap<MarkovAttributeSet, AbstractClusterer>();
// The best AttributeSet + Clusterer we've seen thus far
MarkovAttributeSet best_aset = null;
AbstractClusterer best_clusterer = null;
boolean found_new_best = true;
int round = 0;
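        // Greedy forward selection: in round r we evaluate every unique combination of r candidate
        // attributes, keep the attributes that appear in the top-k cheapest MarkovAttributeSets as the
        // candidates for round r+1, and stop as soon as a round fails to produce a new best set.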
while (round++ < this.num_rounds && found_new_best) {
round_asets.clear();
round_clusterers.clear();
if (debug.val) {
Map<String, Object> m0 = new ListOrderedMap<String, Object>();
m0.put("Round #", String.format("%02d", round));
m0.put("Number of Partitions", this.all_partitions.size());
m0.put("Number of Attributes", all_attributes.size());
m0.put("Best Set", best_aset);
m0.put("Best Cost", (best_aset != null ? best_aset.getCost() : null));
Map<String, Object> m1 = new ListOrderedMap<String, Object>();
for (SplitType stype : SplitType.values()) {
String key = String.format("# of %s Instances", stype.name());
String val = String.format("%-8s [%.02f]", this.split_counts[stype.ordinal()], stype.percentage);
m1.put(key, val);
} // FOR
LOG.debug("\n" + StringUtil.formatMaps(":", true, true, false, false, true, true, m0, m1));
}
final Iterable<Set<Attribute>> it = UniqueCombinationIterator.factory(all_attributes, round);
final List<Set<Attribute>> sets = (List<Set<Attribute>>)CollectionUtil.addAll(new ArrayList<Set<Attribute>>(), it);
final int num_sets = sets.size();
final CountDownLatch latch = new CountDownLatch(num_sets);
final AtomicInteger aset_ctr = new AtomicInteger(0);
for (final Set<Attribute> s : sets) {
Runnable r = new Runnable() {
@Override
public void run() {
MarkovAttributeSet aset = new MarkovAttributeSet(s);
AbstractClusterer clusterer = null;
// if (aset_ctr.get() <= 0) {
if (trace.val) LOG.trace("Constructing AttributeSet: " + aset);
try {
clusterer = FeatureClusterer.this.calculateAttributeSetCost(aset);
} catch (Exception ex) {
LOG.fatal("Failed to calculate MarkovAttributeSet cost for " + aset, ex);
throw new RuntimeException(ex);
}
assert(aset != null);
assert(clusterer != null);
round_asets.add(aset);
round_clusterers.put(aset, clusterer);
all_asets.add(aset);
if (debug.val) {
int my_ctr = aset_ctr.getAndIncrement();
LOG.debug(String.format("[%03d] %s => %.03f", my_ctr, aset, aset.getCost()));
}
// }
latch.countDown();
}
};
this.calculate_threadPool.execute(r);
} // FOR
// Wait until they all finish
if (debug.val) LOG.debug(String.format("Waiting for %d calculateAttributeSetCosts threads to finish", num_sets));
latch.await();
            // Now figure out the top-k MarkovAttributeSets from this round.
            // For now we'll explode out all of the attributes that they contain and throw them into the
            // set of candidate attributes for the next round
all_attributes.clear();
int top_k = (int)Math.round(round_asets.size() * this.round_topk);
for (MarkovAttributeSet aset : round_asets) {
all_attributes.addAll(aset);
if (debug.val) LOG.debug(String.format("%.03f\t%s", aset.getCost(), aset.toString()));
if (top_k-- == 0) break;
} // FOR
// if (round == 1) all_attributes.add(data.attribute(1));
MarkovAttributeSet round_best = round_asets.first();
assert(round_best != null);
if (best_aset == null || round_best.getCost() < best_aset.getCost()) {
best_aset = round_best;
best_clusterer = round_clusterers.get(round_best);
} else {
found_new_best = false;
}
if (debug.val) LOG.debug(String.format("Next Round Attributes [size=%d]: %s", all_attributes.size(), MarkovAttributeSet.toString(all_attributes)));
} // WHILE (round)
this.generateDecisionTree(best_clusterer, best_aset, data);
return (best_aset);
}
/**
* Calculate the cost of a global MarkovGraph estimator
* @throws Exception
*/
protected void calculateGlobalCost() throws Exception {
final Instances trainingData = this.splits[SplitType.TRAINING.ordinal()];
assert(trainingData != null);
final Instances validationData = this.splits[SplitType.VALIDATION.ordinal()];
assert(validationData != null);
// ----------------------------------------------------------------------------
// BUILD GLOBAL MARKOVGRAPH
// ----------------------------------------------------------------------------
for (int i = 0, cnt = trainingData.numInstances(); i < cnt; i++) {
            // Grab the Instance; the original data set includes the txn id that we need to
            // retrieve the proper TransactionTrace record from the workload
Instance inst = trainingData.instance(i);
long txn_id = FeatureUtil.getTransactionId(inst);
TransactionTrace txn_trace = this.workload.getTransaction(txn_id);
assert(txn_trace != null) : "Invalid TxnId #" + txn_id + "\n" + inst;
// Figure out which base partition this txn would execute on
            // because we want to divide the MarkovGraphsContainers by the base partition
int base_partition = this.getBasePartition(txn_trace);
// Update Global MarkovGraph
MarkovGraph markov = this.global_markov.get(base_partition, this.catalog_proc);
assert(markov != null) : "Failed to get Global MarkovGraph for partition #" + base_partition;
markov.processTransaction(txn_trace, this.p_estimator);
} // FOR
// ----------------------------------------------------------------------------
// BUILD GLOBAL COST MODELS
// ----------------------------------------------------------------------------
        for (Integer partition : this.all_partitions) {
MarkovGraph m = this.global_markov.get(partition, this.catalog_proc);
assert(m != null);
m.calculateProbabilities(catalogContext.getAllPartitionIds());
assert(m.isValid()) : "The MarkovGraph at Partition #" + partition + " is not valid!";
} // FOR
if (debug.val) LOG.debug(String.format("Finished initializing GLOBAL MarkovCostModel"));
// ----------------------------------------------------------------------------
// ESTIMATE GLOBAL COST
// ----------------------------------------------------------------------------
int validationCnt = validationData.numInstances();
int recalculate_ctr = 0;
for (int i = 0; i < validationCnt; i++) {
if (trace.val && i > 0 && i % 1000 == 0) LOG.trace(String.format("TransactionTrace %d/%d", i, validationCnt));
Instance inst = validationData.instance(i);
long txn_id = FeatureUtil.getTransactionId(inst);
TransactionTrace txn_trace = this.workload.getTransaction(txn_id);
assert(txn_trace != null);
int base_partition = this.getBasePartition(txn_trace);
// Skip any txn that executes on a partition that we're not evaluating
if (this.all_partitions.contains(base_partition) == false) continue;
// Ok so now let's figure out what this mofo is going to do...
PartitionSet partitions = this.getAllPartitions(txn_trace);
boolean singlepartitioned = (partitions.size() == 1);
// Estimate Global MarkovGraph Cost
double g_cost = this.global_costmodel.estimateTransactionCost(catalogContext, txn_trace);
if (g_cost > 0) {
this.total_g_cost += g_cost;
this.g_counters[singlepartitioned ? 0 : 1]++;
MarkovGraph m = this.global_markov.get(base_partition, this.catalog_proc);
assert(m != null);
m.processTransaction(txn_trace, p_estimator);
// m.calculateProbabilities();
recalculate_ctr++;
}
} // FOR
if (debug.val) LOG.debug(String.format("Recalculated global probabilities %d out of %d times", recalculate_ctr, validationCnt));
}
protected Map<Integer, MarkovGraphsContainer> constructMarkovModels(MarkovAttributeSet aset, Instances data) throws Exception {
// Create an ExecutionState for this run
ExecutionState state = (ExecutionState)this.state_pool.borrowObject();
state.init(this.createClusterer(aset, data));
// Construct the MarkovGraphs for each Partition/Cluster using the Training Data Set
this.generateMarkovGraphs(state, data);
// Generate the MarkovModels for the different partitions+clusters
this.generateMarkovCostModels(state);
Map<Integer, MarkovGraphsContainer> ret = new HashMap<Integer, MarkovGraphsContainer>();
for (int p = 0; p < state.markovs_per_partition.length; p++) {
ret.put(p, state.markovs_per_partition[p]);
} // FOR
return (ret);
}
    /**
     * Train a clusterer on the given MarkovAttributeSet and measure how well its
     * cluster-specific MarkovGraphs predict transaction paths on the validation split.
     * The resulting cost is stored in the MarkovAttributeSet.
     * @param aset
     * @return the trained clusterer
     * @throws Exception
     */
public AbstractClusterer calculateAttributeSetCost(final MarkovAttributeSet aset) throws Exception {
// Build our clusterer
if (debug.val) LOG.debug("Training Clusterer - " + aset);
AbstractClusterer clusterer = this.createClusterer(aset, this.splits[SplitType.TRAINING.ordinal()]);
// Create an ExecutionState for this run
ExecutionState state = (ExecutionState)this.state_pool.borrowObject();
state.init(clusterer);
// Construct the MarkovGraphs for each Partition/Cluster using the Training Data Set
this.generateMarkovGraphs(state, this.splits[SplitType.TRAINING.ordinal()]);
// Generate the MarkovModels for the different partitions+clusters
this.generateMarkovCostModels(state);
// Now we need a mapping from TransactionIds -> ClusterIds
// And then calculate the cost of using our cluster configuration to predict txn paths
double total_c_cost = 0.0d;
int c_counters[] = state.c_counters;
int t_counters[] = state.t_counters;
// Map<Pair<Long, Integer>, Histogram> key_to_cluster = new TreeMap<Pair<Long, Integer>, Histogram>();
// Map<Integer, Histogram> cluster_to_key = new TreeMap<Integer, Histogram>();
Instances validationData = this.splits[SplitType.VALIDATION.ordinal()];
int validationCnt = this.split_counts[SplitType.VALIDATION.ordinal()];
if (debug.val) LOG.debug(String.format("Estimating prediction rates of clusterer with %d transactions...", validationCnt));
for (int i = 0; i < validationCnt; i++) {
            if (trace.val && i > 0 && i % 1000 == 0) LOG.trace(String.format("TransactionTrace %d/%d", i, validationCnt));
Instance inst = validationData.instance(i);
long txn_id = FeatureUtil.getTransactionId(inst);
TransactionTrace txn_trace = this.workload.getTransaction(txn_id);
assert(txn_trace != null);
Integer base_partition = this.getBasePartition(txn_trace);
// Skip any txn that executes on a partition that we're not evaluating
if (this.all_partitions.contains(base_partition) == false) continue;
int c = (int)clusterer.clusterInstance(inst);
// Debug Stuff
// Pair<Long, Integer> key = Pair.of((Long)txn_trace.getParam(1), ((Object[])txn_trace.getParam(4)).length);
// if (key_to_cluster.containsKey(key) == false) key_to_cluster.put(key, new Histogram());
// key_to_cluster.get(key).put(c);
// if (cluster_to_key.containsKey(c) == false) cluster_to_key.put(c, new Histogram());
// cluster_to_key.get(c).put(key);
// if (debug.val) LOG.debug(String.format("[%s, %s] => %d", , c));
// Ok so now let's figure out what this mofo is going to do...
PartitionSet partitions = this.getAllPartitions(txn_trace);
boolean singlepartitioned = (partitions.size() == 1);
t_counters[singlepartitioned ? 0 : 1]++;
t_counters[2]++; // Total # of Txns
// Estimate Clusterer MarkovGraphCost
MarkovCostModel c_costmodel = state.costmodels_per_partition[base_partition.intValue()];
double c_cost = 0.0;
TxnToClusterMarkovGraphsContainer markovs = state.markovs_per_partition[base_partition.intValue()];
markovs.addTransactionClusterXref(txn_id, c);
MarkovGraph markov = markovs.get(c, catalog_proc);
// Check that this is a cluster that we've seen before at this partition
if (markov == null) {
if (trace.val) LOG.warn(String.format("Txn #%d was mapped to never before seen Cluster #%d at partition %d", txn_id, c, base_partition));
markov = markovs.getOrCreate(c, this.catalog_proc).initialize();
markovs.addTransactionClusterXref(txn_id, c);
// state.t_estimators_per_partition[base_partition.intValue()].processTransactionTrace(txn_trace);
c_counters[2]++; // Unknown Clusters
}
c_cost = c_costmodel.estimateTransactionCost(catalogContext, txn_trace);
if (c_cost > 0) {
total_c_cost += c_cost;
c_counters[singlepartitioned ? 0 : 1]++;
// So that we can improve our predictions...
markov.processTransaction(txn_trace, p_estimator);
markov.calculateProbabilities(catalogContext.getAllPartitionIds());
// if (c_counters[singlepartitioned ? 0 : 1] == 1) {
//// MarkovPathEstimator.LOG.setLevel(Level.TRACE);
//// MarkovPathEstimator estimator = new MarkovPathEstimator(markov, c_costmodel.getTransactionEstimator(c), base_partition, txn_trace.getParams());
//// estimator.traverse(markov.getStartVertex());
//// List<Vertex> e_path = estimator.getVisitPath();
//
// List<Vertex> e_path = c_costmodel.getLastEstimatedPath();
// List<Vertex> a_path = c_costmodel.getLastActualPath();
// for (int ii = 0, cnt = Math.max(e_path.size(), a_path.size()); ii < cnt; ii++) {
// Vertex e = (ii < e_path.size() ? e_path.get(ii) : null);
// Vertex a = (ii < a_path.size() ? a_path.get(ii) : null);
// String match = (e != null && e.equals(a) ? "" : "***");
// System.err.println(String.format("%-60s%-10s%s", e, match, a));
// } // FOR
//
// System.err.println("singlepartitioned = " + singlepartitioned);
// System.err.println("cost = " + c_cost);
// System.err.println("all_partitions = " + all_partitions);
// System.err.println("actual partitions (R/W) = " + c_costmodel.getReadWritePartitions(a_path));
// System.err.println("estimated partitions (R/W) = " + c_costmodel.getReadWritePartitions(e_path));
// System.err.println(txn_trace.debug(catalog_db));
//
// LOG.debug("Writing out mispredicated MarkovGraph paths [c_cost=" + c_cost + "]");
// GraphvizExport<Vertex, Edge> gv = MarkovUtil.exportGraphviz(markov, false, markov.getPath(c_costmodel.getLastEstimatedPath()));
// gv.highlightPath(markov.getPath(c_costmodel.getLastActualPath()), "blue");
// System.err.println("GRAPHVIZ: " + gv.writeToTempFile(catalog_proc, (singlepartitioned ? "single" : "multi")));
// System.err.println();
//// System.exit(1);
//
// wrote_gv = true;
// // if (temp++ == 1) System.exit(1);
// }
}
} // FOR
if (debug.val) LOG.debug("Results: " + aset + "\n" + debugCounters(validationCnt, t_counters, c_counters, this.g_counters));
this.state_pool.returnObject(state);
aset.setCost(total_c_cost);
return (clusterer);
}
    /**
     * Cluster each training instance and build the per-partition, per-cluster MarkovGraphs.
     * @param state
     * @param trainingData
     * @throws Exception
     */
protected void generateMarkovGraphs(ExecutionState state, Instances trainingData) throws Exception {
// Now iterate over validation set and construct Markov models
// We have to know which field is our txn_id so that we can quickly access it
int trainingCnt = trainingData.numInstances();
if (trace.val) LOG.trace(String.format("Training MarkovGraphs using %d instances", trainingCnt));
ObjectHistogram<Integer> cluster_h = new ObjectHistogram<Integer>();
ObjectHistogram<Integer> partition_h = new ObjectHistogram<Integer>();
for (int i = 0; i < trainingCnt; i++) {
            // Grab the Instance and throw it at the clusterer to get the target cluster
// The original data set is going to have the txn id that we need to grab
// the proper TransactionTrace record from the workload
Instance inst = trainingData.instance(i);
int c = (int)state.clusterer.clusterInstance(inst);
cluster_h.put(c);
long txn_id = Long.valueOf(inst.stringValue(FeatureExtractor.TXNID_ATTRIBUTE_IDX));
TransactionTrace txn_trace = this.workload.getTransaction(txn_id);
assert(txn_trace != null) : "Invalid TxnId #" + txn_id + "\n" + inst;
// Figure out which base partition this txn would execute on
            // because we want to divide the MarkovGraphsContainers by the base partition
int base_partition = this.p_estimator.getBasePartition(txn_trace);
partition_h.put(base_partition);
// Build up the MarkovGraph for this specific cluster
MarkovGraphsContainer markovs = state.markovs_per_partition[base_partition];
MarkovGraph markov = markovs.get(c, this.catalog_proc);
if (markov == null) {
markov = markovs.getOrCreate(c, this.catalog_proc).initialize();
markovs.put(c, markov);
}
markov.processTransaction(txn_trace, this.p_estimator);
state.clusters_per_partition[base_partition].put(c);
} // FOR
// if (trace.val) LOG.trace("Clusters per Partition:\n" + StringUtil.formatMaps(state.clusters_per_partition));
}
    /**
     * Calculate the probabilities for every per-partition, per-cluster MarkovGraph so that
     * the corresponding MarkovCostModels can be used for path estimation.
     * @param state
     */
protected void generateMarkovCostModels(final ExecutionState state) {
// Now use the validation data set to figure out how well we are able to predict transaction
// execution paths using the trained Markov graphs
// We first need to construct a new costmodel and populate it with TransactionEstimators
if (trace.val) LOG.trace("Constructing CLUSTER-BASED MarkovCostModels");
// IMPORTANT: We run out of memory if we try to build the MarkovGraphs for all of the
// partitions+clusters. So instead we are going to randomly select some of the partitions to be used in the
// cost model estimation.
final CountDownLatch costmodel_latch = new CountDownLatch(this.all_partitions.size());
if (trace.val) LOG.trace(String.format("Generating MarkovGraphs for %d partitions", costmodel_latch.getCount()));
for (final int partition : this.all_partitions) {
final MarkovGraphsContainer markovs = state.markovs_per_partition[partition];
if (trace.val) LOG.trace(String.format("Calculating Partition #%d probabilities for %d clusters", partition, markovs.size()));
for (Entry<Integer, Map<Procedure, MarkovGraph>> e : markovs.entrySet()) {
// if (debug.val) LOG.debug(String.format("Partition %d - Cluster %d", partition, i++));
// Calculate the probabilities for each graph
for (MarkovGraph markov : e.getValue().values()) {
markov.calculateProbabilities(catalogContext.getAllPartitionIds());
} // FOR
} // FOR
if (trace.val) LOG.trace(String.format("Finished processing MarkovGraphs for Partition #%d [count=%d]", partition, costmodel_latch.getCount()));
costmodel_latch.countDown();
// this.generate_threadPool.execute(r);
} // FOR
// // Wait until everyone finishes
// try {
// costmodel_latch.await();
// } catch (Exception ex) {
// throw new RuntimeException(ex);
// }
}
    /**
     * Build a FilteredClusterer that clusters the training data using only the attributes
     * in the given MarkovAttributeSet.
     * @param aset
     * @param trainingData
     * @throws Exception
     */
protected AbstractClusterer createClusterer(MarkovAttributeSet aset, Instances trainingData) throws Exception {
if (trace.val) LOG.trace(String.format("Clustering %d %s instances with %d attributes", trainingData.numInstances(), CatalogUtil.getDisplayName(catalog_proc), aset.size()));
// Create the filter we need so that we only include the attributes in the given MarkovAttributeSet
Filter filter = aset.createFilter(trainingData);
// Using our training set to build the clusterer
int seed = this.rand.nextInt();
// SimpleKMeans inner_clusterer = new SimpleKMeans();
EM inner_clusterer = new EM();
        String options[] = {
            "-N", Integer.toString(1000),   // number of clusters (originally meant to be num_partitions)
            "-S", Integer.toString(seed),   // random seed
            "-I", Integer.toString(100),    // max iterations
        };
inner_clusterer.setOptions(options);
FilteredClusterer filtered_clusterer = new FilteredClusterer();
filtered_clusterer.setFilter(filter);
filtered_clusterer.setClusterer(inner_clusterer);
AbstractClusterer clusterer = filtered_clusterer;
clusterer.buildClusterer(trainingData);
return (clusterer);
}
protected Classifier generateDecisionTree(AbstractClusterer clusterer, MarkovAttributeSet aset, Instances data) throws Exception {
// We need to create a new Attribute that has the ClusterId
Instances newData = data; // new Instances(data);
newData.insertAttributeAt(new Attribute("ClusterId"), newData.numAttributes());
Attribute cluster_attr = newData.attribute(newData.numAttributes()-1);
assert(cluster_attr != null);
assert(cluster_attr.index() > 0);
newData.setClass(cluster_attr);
// We will then tell the Classifier to predict that ClusterId based on the MarkovAttributeSet
ObjectHistogram<Integer> cluster_h = new ObjectHistogram<Integer>();
for (int i = 0, cnt = newData.numInstances(); i < cnt; i++) {
            // Grab the Instance and throw it at the clusterer to get the target cluster
Instance inst = newData.instance(i);
int c = (int)clusterer.clusterInstance(inst);
inst.setClassValue(c);
cluster_h.put(c);
} // FOR
System.err.println("Number of Elements: " + cluster_h.getValueCount());
System.err.println(cluster_h);
NumericToNominal filter = new NumericToNominal();
filter.setInputFormat(newData);
newData = Filter.useFilter(newData, filter);
String output = this.catalog_proc.getName() + "-labeled.arff";
FileUtil.writeStringToFile(output, newData.toString());
LOG.info("Wrote labeled data set to " + output);
// Decision Tree
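        // J48 is Weka's C4.5 decision tree learner; we train it to predict the ClusterId class attribute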
J48 j48 = new J48();
String options[] = {
"-S", Integer.toString(this.rand.nextInt()),
};
j48.setOptions(options);
// Make sure we add the ClusterId attribute to a new MarkovAttributeSet so that
// we can tell the Classifier to classify that!
FilteredClassifier fc = new FilteredClassifier();
MarkovAttributeSet classifier_aset = new MarkovAttributeSet(aset);
classifier_aset.add(cluster_attr);
fc.setFilter(classifier_aset.createFilter(newData));
fc.setClassifier(j48);
// Bombs away!
fc.buildClassifier(newData);
return (fc);
}
/**
     * Helper method to convert Feature keys to Attributes
* @param data
* @param prefixes
* @return
*/
public static Set<Attribute> prefix2attributes(Instances data, String...prefixes) {
Set<Attribute> attributes = new ListOrderedSet<Attribute>();
for (String key : prefixes) {
Attribute attribute = data.attribute(key);
assert(attribute != null) : "Invalid Attribute key '" + key + "'";
attributes.add(attribute);
} // FOR
return (attributes);
}
protected static String debugCounters(int validationCnt, int t_counters[], int c_counters[], int g_counters[]) {
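        // Each counters array is laid out as [0]=single-partition, [1]=multi-partition, and [2]=total txns
        // (for t_counters) or unknown clusters (for the clusterer/global counters). The single/multi slots
        // count mispredicted txns (those with a non-zero cost), so the table below reports accuracy as
        // (total - mispredicted) / total.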
int values[][] = new int[][]{
t_counters,
c_counters,
g_counters,
};
String labels[] = {
"Prediction Result",
"Single-Partition",
"Multi-Partition",
"Unknown Clusters",
};
int totals[] = {
t_counters[2],
t_counters[0],
t_counters[1],
t_counters[2],
};
final int total_txns = values[0][2];
final int value_len = Integer.toString(total_txns).length();
final String f = "%" + value_len + "d / %" + value_len + "d [%.03f]";
final ListOrderedMap<?, ?> maps[] = new ListOrderedMap<?, ?>[values.length];
for (int i = 0; i < values.length; i++) {
ListOrderedMap<String, String> m = new ListOrderedMap<String, String>();
int singlep = values[i][0];
int multip = values[i][1];
int missed = values[i][2];
if (i == 0) {
m.put("# of Evaluated Transactions", String.format(f, total_txns, validationCnt, (total_txns / (double)validationCnt)));
} else {
String prefix = (i == 1 ? "Clusterer" : "Global");
int inner[] = new int[]{
singlep + multip,
singlep,
multip,
total_txns - missed,
};
for (int ii = 0; ii < inner.length; ii++) {
String value = String.format(f, totals[ii] - inner[ii], // Count
totals[ii], // Total
1.0 - (inner[ii] / (double)totals[ii]) // Percentage
);
m.put(prefix + " " + labels[ii], value);
} // FOR
}
maps[i] = m;
} // FOR
return (StringUtil.formatMaps(maps));
}
/**
* Main!
* @param vargs
* @throws Exception
*/
public static void main(String[] vargs) throws Exception {
ArgumentsParser args = ArgumentsParser.load(vargs);
args.require(
ArgumentsParser.PARAM_CATALOG,
ArgumentsParser.PARAM_WORKLOAD,
ArgumentsParser.PARAM_MAPPINGS
);
// Number of threads
int num_threads = FeatureClusterer.DEFAULT_NUM_THREADS;
if (args.hasIntParam(ArgumentsParser.PARAM_MARKOV_THREADS)) {
num_threads = args.getIntParam(ArgumentsParser.PARAM_MARKOV_THREADS);
}
        // Get the procedure we're supposed to investigate
String proc_name = args.getOptParam(0);
Procedure catalog_proc = args.catalog_db.getProcedures().getIgnoreCase(proc_name);
assert(catalog_proc != null) : proc_name;
// And our Weka data file
// File arff_path = new File(args.getOptParam(1));
// assert(arff_path.exists()) : arff_path.getAbsolutePath();
// BufferedReader reader = new BufferedReader(new FileReader(arff_path));
// Instances data = new Instances(reader);
// reader.close();
// data = new Instances(data, 0, args.workload.getTransactionCount());
Instances data = null;
{
// Hopefully this will get garbage collected if we put it here...
FeatureExtractor fextractor = new FeatureExtractor(args.catalogContext);
Map<Procedure, FeatureSet> fsets = fextractor.calculate(args.workload);
FeatureSet fset = fsets.get(catalog_proc);
assert(fset != null) : "Failed to get FeatureSet for " + catalog_proc;
data = fset.export(catalog_proc.getName());
}
assert(data != null);
assert(args.workload.getTransactionCount() == data.numInstances());
PartitionSet partitions = null;
if (args.hasParam(ArgumentsParser.PARAM_WORKLOAD_RANDOM_PARTITIONS)) {
PartitionEstimator p_estimator = new PartitionEstimator(args.catalogContext);
final ObjectHistogram<Integer> h = new ObjectHistogram<Integer>();
for (TransactionTrace txn_trace : args.workload.getTransactions()) {
int base_partition = p_estimator.getBasePartition(txn_trace);
h.put(base_partition);
} // FOR
// System.err.println("# OF PARTITIONS: " + h.getValueCount());
// h.setKeepZeroEntries(true);
// for (Integer p : CatalogUtil.getAllPartitionIds(args.catalog_db)) {
// if (h.contains(p) == false) h.put(p, 0);
// }
// System.err.println(h);
// System.exit(1);
//
partitions = new PartitionSet(h.values());
} else {
partitions = args.catalogContext.getAllPartitionIds();
}
FeatureClusterer fclusterer = new FeatureClusterer(args.catalogContext,
catalog_proc,
args.workload,
partitions,
num_threads);
// Update split configuration variables
for (SplitType type : SplitType.values()) {
String param_name = String.format("%s.%s", ArgumentsParser.PARAM_MARKOV_SPLIT, type.name());
if (args.hasDoubleParam(param_name) == false) continue;
double percentage = args.getDoubleParam(param_name);
fclusterer.setSplitPercentage(type, percentage);
} // FOR
if (args.hasDoubleParam(ArgumentsParser.PARAM_MARKOV_TOPK)) {
fclusterer.setAttributeTopK(args.getDoubleParam(ArgumentsParser.PARAM_MARKOV_TOPK));
}
if (args.hasIntParam(ArgumentsParser.PARAM_MARKOV_ROUNDS)) {
fclusterer.setNumRounds(args.getIntParam(ArgumentsParser.PARAM_MARKOV_ROUNDS));
}
// MarkovAttributeSet aset = fclusterer.calculate(data);
// HACK
Set<Attribute> attributes = FeatureClusterer.prefix2attributes(data,
"ParamArrayLength-04"
// "ParamHashPartition-01"
);
MarkovAttributeSet aset = new MarkovAttributeSet(attributes);
Map<Integer, MarkovGraphsContainer> markovs = fclusterer.constructMarkovModels(aset, data);
File output = new File(catalog_proc.getName() + ".markovs");
MarkovGraphsContainerUtil.save(markovs, output);
// fclusterer.calculateGlobalCost();
// AbstractClusterer clusterer = fclusterer.calculateAttributeSetCost(aset);
// fclusterer.generateDecisionTree(clusterer, aset, data);
//
// System.err.println(aset + "\nCost: " + aset.getCost());
fclusterer.cleanup();
}
}