package edu.brown.markov;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.collections15.map.ListOrderedMap;
import org.apache.commons.collections15.set.ListOrderedSet;
import org.apache.commons.pool.BasePoolableObjectFactory;
import org.apache.log4j.Logger;
import org.voltdb.CatalogContext;
import org.voltdb.catalog.Procedure;
import weka.classifiers.Classifier;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.J48;
import weka.clusterers.AbstractClusterer;
import weka.clusterers.EM;
import weka.clusterers.FilteredClusterer;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.NumericToNominal;
import edu.brown.catalog.CatalogUtil;
import edu.brown.costmodel.MarkovCostModel;
import edu.brown.hstore.estimators.markov.MarkovEstimator;
import edu.brown.logging.LoggerUtil;
import edu.brown.logging.LoggerUtil.LoggerBoolean;
import edu.brown.markov.containers.MarkovGraphsContainerUtil;
import edu.brown.markov.containers.MarkovGraphsContainer;
import edu.brown.markov.features.BasePartitionFeature;
import edu.brown.markov.features.FeatureUtil;
import edu.brown.pools.FastObjectPool;
import edu.brown.pools.Poolable;
import edu.brown.statistics.ObjectHistogram;
import edu.brown.utils.ArgumentsParser;
import edu.brown.utils.CollectionUtil;
import edu.brown.utils.FileUtil;
import edu.brown.utils.PartitionEstimator;
import edu.brown.utils.PartitionSet;
import edu.brown.utils.StringUtil;
import edu.brown.utils.UniqueCombinationIterator;
import edu.brown.workload.TransactionTrace;
import edu.brown.workload.Workload;
/**
*
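 * Feed-forward feature selection for clustering a procedure's transactions into
 * per-cluster MarkovGraphs. The search looks for the MarkovAttributeSet whose clustering
 * produces the lowest MarkovCostModel prediction cost on the validation split.
 * <p>
 * A minimal usage sketch mirroring the workflow in {@link #main} (assuming {@code data}
 * is the Weka Instances export of this procedure's FeatureSet, and the caller lives in
 * the same package since these methods are protected):
 * <pre>{@code
 * FeatureClusterer fclusterer = new FeatureClusterer(catalogContext, catalog_proc,
 *                                                    workload, partitions, num_threads);
 * MarkovAttributeSet best = fclusterer.calculate(data);
 * System.out.println(best + " -> cost " + best.getCost());
 * fclusterer.cleanup();
 * }</pre>
 *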
* @author pavlo
*/
public class FeatureClusterer {
private static final Logger LOG = Logger.getLogger(FeatureClusterer.class);
private static final LoggerBoolean debug = new LoggerBoolean();
private static final LoggerBoolean trace = new LoggerBoolean();
static {
LoggerUtil.attachObserver(LOG, debug, trace);
}
public enum SplitType {
/** Training Workload Percentage */
TRAINING (0.40),
/** Validation Workload Percentage */
VALIDATION (0.60),
/** Testing Workload Percentage */
TESTING (0.00);
private final double percentage;
private SplitType(double percentage) {
this.percentage = percentage;
}
public double getPercentage() {
return this.percentage;
}
}
// ----------------------------------------------------------------------------
// DEFAULT CONFIGURATION VALUES
// ----------------------------------------------------------------------------
    /** For each search round, we will only propagate the attributes found in the top-k AttributeSets */
private static final double DEFAULT_ATTRIBUTESET_TOP_K = 0.10;
/** Number of threads to use per thread pool */
private static final int DEFAULT_NUM_THREADS = 2;
/** Number of search rounds in findBestMarkovAttributeSet */
private static final int DEFAULT_NUM_ROUNDS = 10;
// ----------------------------------------------------------------------------
// MARKOVGRAPHSCONTAINER WRAPPER
// ----------------------------------------------------------------------------
private static class TxnToClusterMarkovGraphsContainer extends MarkovGraphsContainer {
/**
* Hackish cross-reference table to go from the TransactionId to Cluster#
*/
private final Map<Long, Integer> txnid_cluster_xref = new HashMap<Long, Integer>();
@Override
public MarkovGraph getFromParams(Long txn_id, int base_partition, Object[] params, Procedure catalog_proc) {
// Look-up what cluster our TransactionTrace belongs to
Integer cluster = this.txnid_cluster_xref.get(txn_id);
assert(cluster != null) : "Failed to initialize TransactionId->Cluster# xref for txn #" + txn_id;
return this.get(cluster, catalog_proc);
}
/**
* Map a TransactionId to a ClusterId
* @param txn_id
* @param cluster_id
*/
public void addTransactionClusterXref(long txn_id, int cluster_id) {
this.txnid_cluster_xref.put(txn_id, cluster_id);
}
@Override
public void clear() {
super.clear();
this.txnid_cluster_xref.clear();
}
}
// ----------------------------------------------------------------------------
// EXECUTION STATE
// ----------------------------------------------------------------------------
/**
* ExecutionState Factory
*/
private static class ExecutionStateFactory extends BasePoolableObjectFactory {
private final FeatureClusterer fclusterer;
public ExecutionStateFactory(FeatureClusterer fclusterer) {
this.fclusterer = fclusterer;
}
@Override
public Object makeObject() throws Exception {
return this.fclusterer.new ExecutionState();
}
@Override
public void passivateObject(Object obj) throws Exception {
ExecutionState state = (ExecutionState)obj;
state.finish();
}
} // END CLASS
    /**
     * Per-run execution state: the clusterer under evaluation plus the per-partition
     * MarkovGraphs, cost models, and TransactionEstimators used to score it.
     */
private class ExecutionState implements Poolable {
/**
* Current Clusterer for this ExecutionState
*/
AbstractClusterer clusterer;
/**
* Set of all the ClusterIds that we have seen
*/
final Set<Integer> cluster_ids = new HashSet<Integer>();
/**
* We want to always split the MarkovGraphContainers by base partition, since we already know
* that this is going to be the best predictor
*/
final TxnToClusterMarkovGraphsContainer markovs_per_partition[];
/**
* Then we have a costmodel for each PartitionId
*/
final MarkovCostModel costmodels_per_partition[];
/**
* And a TransactionEstimator for each PartitionId
*/
final MarkovEstimator t_estimators_per_partition[];
/**
* Histogram of Clusters Per Partition
*/
final ObjectHistogram<Integer> clusters_per_partition[];
int c_counters[] = new int[] {
0, // Single-P
0, // Multi-P
            0, // Unknown Clusters
};
int t_counters[] = new int[] {
0, // Single-P
0, // Multi-P
0, // Total # of Txns
};
/**
* Constructor
*/
@SuppressWarnings("unchecked")
private ExecutionState() {
// We allocate a complete array for all of the partitions in the catalog
this.markovs_per_partition = new TxnToClusterMarkovGraphsContainer[FeatureClusterer.this.total_num_partitions];
this.costmodels_per_partition = new MarkovCostModel[FeatureClusterer.this.total_num_partitions];
this.t_estimators_per_partition = new MarkovEstimator[FeatureClusterer.this.total_num_partitions];
this.clusters_per_partition = (ObjectHistogram<Integer>[])new ObjectHistogram<?>[FeatureClusterer.this.total_num_partitions];
// But then only initialize the partition-specific data structures
for (int p : FeatureClusterer.this.all_partitions) {
this.clusters_per_partition[p] = new ObjectHistogram<Integer>();
this.markovs_per_partition[p] = new TxnToClusterMarkovGraphsContainer();
this.t_estimators_per_partition[p] = new MarkovEstimator(catalogContext, p_estimator, this.markovs_per_partition[p]);
this.costmodels_per_partition[p] = new MarkovCostModel(catalogContext, p_estimator, this.t_estimators_per_partition[p], thresholds);
} // FOR
}
public void init(AbstractClusterer clusterer) {
this.clusterer = clusterer;
}
@Override
public boolean isInitialized() {
return (this.clusterer != null);
}
public void finish() {
this.clusterer = null;
this.cluster_ids.clear();
for (int p : FeatureClusterer.this.all_partitions) {
this.clusters_per_partition[p].clear();
this.markovs_per_partition[p].clear();
                // It's not pretty, but we need to recreate the cost model here so that it starts with a clean slate for the next run
this.costmodels_per_partition[p] = new MarkovCostModel(catalogContext, p_estimator, this.t_estimators_per_partition[p], thresholds);
} // FOR
// Reset Counters
for (int i = 0; i < this.c_counters.length; i++) {
this.c_counters[i] = 0;
this.t_counters[i] = 0;
} // FOR
}
}
// ----------------------------------------------------------------------------
// DATA MEMBERS
// ----------------------------------------------------------------------------
/**
     * We also maintain a "global" MarkovGraphsContainer that consumes all transactions.
     * We will use it to compare whether our cluster-specific models do better than the global one.
* This is automatically connected to the FeatureClusterer's base partition cache, so we
* don't have to do anything special to get out what we need here
*/
private final MarkovGraphsContainer global_markov = new MarkovGraphsContainer() {
        @Override
        public MarkovGraph getFromParams(Long txn_id, int base_partition, Object[] params, Procedure catalog_proc) {
            return (this.get(base_partition, catalog_proc));
        }
};
/**
* Global Cost Model
*/
private final MarkovCostModel global_costmodel;
/**
* Global TransactionEstimator
*/
private final MarkovEstimator global_t_estimator;
/**
* Global Counters
*/
private double total_g_cost = 0.0d;
private int g_counters[] = new int[] {
0, // Single-P
0, // Multi-P
        0, // Unknown Clusters
};
private final Instances splits[] = new Instances[SplitType.values().length];
private final double split_percentages[] = new double[SplitType.values().length];
private final int split_counts[] = new int[SplitType.values().length];
private final CatalogContext catalogContext;
private final Procedure catalog_proc;
private final Workload workload;
private final EstimationThresholds thresholds;
private final PartitionEstimator p_estimator;
private final Random rand = new Random(); // FIXME
private final PartitionSet all_partitions;
private final int total_num_partitions;
private final FastObjectPool<ExecutionState> state_pool = new FastObjectPool<ExecutionState>(new ExecutionStateFactory(this));
    /** We use a single thread pool for all of the cost-calculation threads */
private final ExecutorService calculate_threadPool;
private double round_topk = DEFAULT_ATTRIBUTESET_TOP_K;
private int num_rounds = DEFAULT_NUM_ROUNDS;
private final Map<Long, PartitionSet> cache_all_partitions = new HashMap<Long, PartitionSet>();
private final Map<Long, Integer> cache_base_partition = new HashMap<Long, Integer>();
/**
* Constructor
* @param catalogContext
* @param catalog_proc
* @param workload
* @param all_partitions
* @param num_threads
*/
public FeatureClusterer(CatalogContext catalogContext, Procedure catalog_proc, Workload workload, PartitionSet all_partitions, int num_threads) {
this.catalogContext = catalogContext;
this.catalog_proc = catalog_proc;
this.workload = workload;
this.thresholds = new EstimationThresholds(); // FIXME
this.p_estimator = new PartitionEstimator(catalogContext);
this.all_partitions = all_partitions;
this.total_num_partitions = catalogContext.numberOfPartitions;
this.calculate_threadPool = Executors.newFixedThreadPool(num_threads);
for (SplitType type : SplitType.values()) {
this.split_percentages[type.ordinal()] = type.percentage;
} // FOR
this.global_t_estimator = new MarkovEstimator(this.catalogContext, this.p_estimator, this.global_markov);
this.global_costmodel = new MarkovCostModel(catalogContext, this.p_estimator, this.global_t_estimator, this.thresholds);
        for (Integer p : this.all_partitions) {
            this.global_markov.getOrCreate(p, this.catalog_proc).initialize();
        } // FOR
}
/**
* Constructor
*/
public FeatureClusterer(CatalogContext catalogContext, Procedure catalog_proc, Workload workload, PartitionSet all_partitions) {
this(catalogContext, catalog_proc, workload, all_partitions, DEFAULT_NUM_THREADS);
}
protected final void cleanup() {
// this.generate_threadPool.shutdownNow();
this.calculate_threadPool.shutdownNow();
}
public void setNumRounds(int numRounds) {
this.num_rounds = numRounds;
if (debug.val) LOG.debug("Number of Rounds: " + numRounds);
}
public void setSplitPercentage(SplitType type, double percentage) {
this.split_percentages[type.ordinal()] = percentage;
if (debug.val) LOG.debug(String.format("%s Split Percentage: ", type.name(), percentage));
}
public void setAttributeTopK(double topk) {
this.round_topk = topk;
if (debug.val) LOG.debug("Attribute Top-K: " + topk);
}
protected MarkovCostModel getGlobalCostModel() {
return this.global_costmodel;
}
protected MarkovGraphsContainer getGlobalMarkovGraphs() {
return this.global_markov;
}
protected int[] getGlobalCounters() {
return (this.g_counters);
}
    /**
     * Split the input data set into training/validation/testing subsets according to the
     * configured SplitType percentages, converting each split's attributes to nominal values.
     * @param data
     * @return the splits indexed by SplitType ordinal
     */
protected Instances[] splitWorkload(Instances data) {
int offset = 0;
int all_cnt = data.numInstances();
for (SplitType stype : SplitType.values()) {
int idx = stype.ordinal();
this.split_counts[idx] = (int)Math.round(all_cnt * stype.percentage);
try {
this.splits[idx] = new Instances(data, offset, this.split_counts[idx]);
// Apply NumericToNominal filter!
NumericToNominal filter = new NumericToNominal();
filter.setInputFormat(this.splits[idx]);
this.splits[idx] = Filter.useFilter(this.splits[idx], filter);
} catch (Exception ex) {
throw new RuntimeException("Failed to split " + stype + " workload", ex);
}
offset += this.split_counts[idx];
if (debug.val) LOG.debug(String.format("%-12s%d", stype.toString()+":", this.split_counts[idx]));
} // FOR
return (this.splits);
}
// ----------------------------------------------------------------------------
// CACHING METHODS
// ----------------------------------------------------------------------------
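    /**
     * Return the base partition for the given TransactionTrace, caching the result
     * so that repeated look-ups for the same transaction are cheap.
     */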
private int getBasePartition(TransactionTrace txn_trace) {
Long txn_id = Long.valueOf(txn_trace.getTransactionId());
Integer base_partition = this.cache_base_partition.get(txn_id);
if (base_partition == null) {
try {
base_partition = this.p_estimator.getBasePartition(txn_trace);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
this.cache_base_partition.put(txn_id, base_partition);
}
return (base_partition.intValue());
}
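    /**
     * Return the full set of partitions touched by the given TransactionTrace,
     * caching the result per transaction id.
     */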
private PartitionSet getAllPartitions(TransactionTrace txn_trace) {
Long txn_id = Long.valueOf(txn_trace.getTransactionId());
PartitionSet all_partitions = this.cache_all_partitions.get(txn_id);
if (all_partitions == null) {
all_partitions = new PartitionSet();
try {
this.p_estimator.getAllPartitions(all_partitions, txn_trace);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
this.cache_all_partitions.put(txn_id, all_partitions);
}
return (all_partitions);
}
// ----------------------------------------------------------------------------
// CALCULATION METHODS
// ----------------------------------------------------------------------------
    /**
     * Perform feed-forward feature selection to find the MarkovAttributeSet whose clustering
     * gives the lowest prediction cost for this procedure.
     * @param data
     * @return the best MarkovAttributeSet found
     * @throws Exception
     */
@SuppressWarnings("unchecked")
protected MarkovAttributeSet calculate(final Instances data) throws Exception {
// ----------------------------------------------------------------------------
// Split the input data set into separate data sets
// ----------------------------------------------------------------------------
if (debug.val) LOG.debug(String.format("Splitting %d instances", data.numInstances()));
this.splitWorkload(data);
// ----------------------------------------------------------------------------
// Calculate global information
// ----------------------------------------------------------------------------
if (debug.val) LOG.debug("Calculating Global MarkovGraph cost");
this.calculateGlobalCost();
// ----------------------------------------------------------------------------
// Perform Feed-Forward Selection
// ----------------------------------------------------------------------------
Attribute base_partition_attr = data.attribute(FeatureUtil.getFeatureKeyPrefix(BasePartitionFeature.class));
assert(base_partition_attr != null);
Integer base_partition_idx = base_partition_attr.index();
assert(base_partition_idx != null);
// Get the list of all the attributes that we are going to want to try to cluster on
// We want to always remove the first attribute because that's the TransactionId
List<Attribute> temp = (List<Attribute>)CollectionUtil.addAll(new ArrayList<Attribute>(), data.enumerateAttributes());
// Remove the TransactionId and BasePartition features
temp.remove(FeatureExtractor.TXNID_ATTRIBUTE_IDX);
        temp.remove(base_partition_attr); // remove by object; removing the boxed index would call remove(Object) and do nothing
Collections.shuffle(temp, this.rand);
ListOrderedSet<Attribute> all_attributes = new ListOrderedSet<Attribute>();
all_attributes.addAll(temp);
// List of all AttributeSets ever created
final SortedSet<MarkovAttributeSet> all_asets = new TreeSet<MarkovAttributeSet>();
// The AttributeSets created in each round
final SortedSet<MarkovAttributeSet> round_asets = new TreeSet<MarkovAttributeSet>();
final Map<MarkovAttributeSet, AbstractClusterer> round_clusterers = new HashMap<MarkovAttributeSet, AbstractClusterer>();
// The best AttributeSet + Clusterer we've seen thus far
MarkovAttributeSet best_aset = null;
AbstractClusterer best_clusterer = null;
boolean found_new_best = true;
int round = 0;
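        // Greedy forward selection: in round r we evaluate every unique combination of r candidate
        // attributes, keep the attributes that appear in the top-k cheapest MarkovAttributeSets as the
        // candidates for round r+1, and stop as soon as a round fails to produce a new best set.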
while (round++ < this.num_rounds && found_new_best) {
round_asets.clear();
round_clusterers.clear();
if (debug.val) {
Map<String, Object> m0 = new ListOrderedMap<String, Object>();
m0.put("Round #", String.format("%02d", round));
m0.put("Number of Partitions", this.all_partitions.size());
m0.put("Number of Attributes", all_attributes.size());
m0.put("Best Set", best_aset);
m0.put("Best Cost", (best_aset != null ? best_aset.getCost() : null));
Map<String, Object> m1 = new ListOrderedMap<String, Object>();
for (SplitType stype : SplitType.values()) {
String key = String.format("# of %s Instances", stype.name());
String val = String.format("%-8s [%.02f]", this.split_counts[stype.ordinal()], stype.percentage);
m1.put(key, val);
} // FOR
LOG.debug("\n" + StringUtil.formatMaps(":", true, true, false, false, true, true, m0, m1));
}
final Iterable<Set<Attribute>> it = UniqueCombinationIterator.factory(all_attributes, round);
final List<Set<Attribute>> sets = (List<Set<Attribute>>)CollectionUtil.addAll(new ArrayList<Set<Attribute>>(), it);
final int num_sets = sets.size();
final CountDownLatch latch = new CountDownLatch(num_sets);
final AtomicInteger aset_ctr = new AtomicInteger(0);
for (final Set<Attribute> s : sets) {
Runnable r = new Runnable() {
@Override
public void run() {
MarkovAttributeSet aset = new MarkovAttributeSet(s);
AbstractClusterer clusterer = null;
// if (aset_ctr.get() <= 0) {
if (trace.val) LOG.trace("Constructing AttributeSet: " + aset);
try {
clusterer = FeatureClusterer.this.calculateAttributeSetCost(aset);
} catch (Exception ex) {
LOG.fatal("Failed to calculate MarkovAttributeSet cost for " + aset, ex);
throw new RuntimeException(ex);
}
assert(aset != null);
assert(clusterer != null);
round_asets.add(aset);
round_clusterers.put(aset, clusterer);
all_asets.add(aset);
if (debug.val) {
int my_ctr = aset_ctr.getAndIncrement();
LOG.debug(String.format("[%03d] %s => %.03f", my_ctr, aset, aset.getCost()));
}
// }
latch.countDown();
}
};
this.calculate_threadPool.execute(r);
} // FOR
// Wait until they all finish
if (debug.val) LOG.debug(String.format("Waiting for %d calculateAttributeSetCosts threads to finish", num_sets));
latch.await();
            // Now figure out the top-k MarkovAttributeSets from this round.
            // For now we'll explode out all of the attributes that they contain and throw them into the
            // set of candidate attributes for the next round
all_attributes.clear();
int top_k = (int)Math.round(round_asets.size() * this.round_topk);
for (MarkovAttributeSet aset : round_asets) {
all_attributes.addAll(aset);
if (debug.val) LOG.debug(String.format("%.03f\t%s", aset.getCost(), aset.toString()));
if (top_k-- == 0) break;
} // FOR
// if (round == 1) all_attributes.add(data.attribute(1));
MarkovAttributeSet round_best = round_asets.first();
assert(round_best != null);
if (best_aset == null || round_best.getCost() < best_aset.getCost()) {
best_aset = round_best;
best_clusterer = round_clusterers.get(round_best);
} else {
found_new_best = false;
}
if (debug.val) LOG.debug(String.format("Next Round Attributes [size=%d]: %s", all_attributes.size(), MarkovAttributeSet.toString(all_attributes)));
} // WHILE (round)
this.generateDecisionTree(best_clusterer, best_aset, data);
return (best_aset);
}
/**
* Calculate the cost of a global MarkovGraph estimator
* @throws Exception
*/
protected void calculateGlobalCost() throws Exception {
final Instances trainingData = this.splits[SplitType.TRAINING.ordinal()];
assert(trainingData != null);
final Instances validationData = this.splits[SplitType.VALIDATION.ordinal()];
assert(validationData != null);
// ----------------------------------------------------------------------------
// BUILD GLOBAL MARKOVGRAPH
// ----------------------------------------------------------------------------
for (int i = 0, cnt = trainingData.numInstances(); i < cnt; i++) {
            // Grab the Instance; the original data set includes the txn id that we need to
            // retrieve the proper TransactionTrace record from the workload
Instance inst = trainingData.instance(i);
long txn_id = FeatureUtil.getTransactionId(inst);
TransactionTrace txn_trace = this.workload.getTransaction(txn_id);
assert(txn_trace != null) : "Invalid TxnId #" + txn_id + "\n" + inst;
// Figure out which base partition this txn would execute on
            // because we want to divide the MarkovGraphsContainers by the base partition
int base_partition = this.getBasePartition(txn_trace);
// Update Global MarkovGraph
MarkovGraph markov = this.global_markov.get(base_partition, this.catalog_proc);
assert(markov != null) : "Failed to get Global MarkovGraph for partition #" + base_partition;
markov.processTransaction(txn_trace, this.p_estimator);
} // FOR
// ----------------------------------------------------------------------------
// BUILD GLOBAL COST MODELS
// ----------------------------------------------------------------------------
        for (Integer partition : this.all_partitions) {
MarkovGraph m = this.global_markov.get(partition, this.catalog_proc);
assert(m != null);
m.calculateProbabilities(catalogContext.getAllPartitionIds());
assert(m.isValid()) : "The MarkovGraph at Partition #" + partition + " is not valid!";
} // FOR
if (debug.val) LOG.debug(String.format("Finished initializing GLOBAL MarkovCostModel"));
// ----------------------------------------------------------------------------
// ESTIMATE GLOBAL COST
// ----------------------------------------------------------------------------
int validationCnt = validationData.numInstances();
int recalculate_ctr = 0;
for (int i = 0; i < validationCnt; i++) {
if (trace.val && i > 0 && i % 1000 == 0) LOG.trace(String.format("TransactionTrace %d/%d", i, validationCnt));
Instance inst = validationData.instance(i);
long txn_id = FeatureUtil.getTransactionId(inst);
TransactionTrace txn_trace = this.workload.getTransaction(txn_id);
assert(txn_trace != null);
int base_partition = this.getBasePartition(txn_trace);
// Skip any txn that executes on a partition that we're not evaluating
if (this.all_partitions.contains(base_partition) == false) continue;
// Ok so now let's figure out what this mofo is going to do...
PartitionSet partitions = this.getAllPartitions(txn_trace);
boolean singlepartitioned = (partitions.size() == 1);
// Estimate Global MarkovGraph Cost
double g_cost = this.global_costmodel.estimateTransactionCost(catalogContext, txn_trace);
if (g_cost > 0) {
this.total_g_cost += g_cost;
this.g_counters[singlepartitioned ? 0 : 1]++;
MarkovGraph m = this.global_markov.get(base_partition, this.catalog_proc);
assert(m != null);
m.processTransaction(txn_trace, p_estimator);
// m.calculateProbabilities();
recalculate_ctr++;
}
} // FOR
if (debug.val) LOG.debug(String.format("Recalculated global probabilities %d out of %d times", recalculate_ctr, validationCnt));
}
protected Map<Integer, MarkovGraphsContainer> constructMarkovModels(MarkovAttributeSet aset, Instances data) throws Exception {
// Create an ExecutionState for this run
ExecutionState state = (ExecutionState)this.state_pool.borrowObject();
state.init(this.createClusterer(aset, data));
// Construct the MarkovGraphs for each Partition/Cluster using the Training Data Set
this.generateMarkovGraphs(state, data);
// Generate the MarkovModels for the different partitions+clusters
this.generateMarkovCostModels(state);
Map<Integer, MarkovGraphsContainer> ret = new HashMap<Integer, MarkovGraphsContainer>();
for (int p = 0; p < state.markovs_per_partition.length; p++) {
ret.put(p, state.markovs_per_partition[p]);
} // FOR
return (ret);
}
    /**
     * Train a clusterer on the given MarkovAttributeSet and measure how well its
     * cluster-specific MarkovGraphs predict transaction paths on the validation split.
     * The resulting cost is stored in the MarkovAttributeSet.
     * @param aset
     * @return the trained clusterer
     * @throws Exception
     */
public AbstractClusterer calculateAttributeSetCost(final MarkovAttributeSet aset) throws Exception {
// Build our clusterer
if (debug.val) LOG.debug("Training Clusterer - " + aset);
AbstractClusterer clusterer = this.createClusterer(aset, this.splits[SplitType.TRAINING.ordinal()]);
// Create an ExecutionState for this run
ExecutionState state = (ExecutionState)this.state_pool.borrowObject();
state.init(clusterer);
// Construct the MarkovGraphs for each Partition/Cluster using the Training Data Set
this.generateMarkovGraphs(state, this.splits[SplitType.TRAINING.ordinal()]);
// Generate the MarkovModels for the different partitions+clusters
this.generateMarkovCostModels(state);
// Now we need a mapping from TransactionIds -> ClusterIds
// And then calculate the cost of using our cluster configuration to predict txn paths
double total_c_cost = 0.0d;
int c_counters[] = state.c_counters;
int t_counters[] = state.t_counters;
// Map<Pair<Long, Integer>, Histogram> key_to_cluster = new TreeMap<Pair<Long, Integer>, Histogram>();
// Map<Integer, Histogram> cluster_to_key = new TreeMap<Integer, Histogram>();
Instances validationData = this.splits[SplitType.VALIDATION.ordinal()];
int validationCnt = this.split_counts[SplitType.VALIDATION.ordinal()];
if (debug.val) LOG.debug(String.format("Estimating prediction rates of clusterer with %d transactions...", validationCnt));
for (int i = 0; i < validationCnt; i++) {
            if (trace.val && i > 0 && i % 1000 == 0) LOG.trace(String.format("TransactionTrace %d/%d", i, validationCnt));
Instance inst = validationData.instance(i);
long txn_id = FeatureUtil.getTransactionId(inst);
TransactionTrace txn_trace = this.workload.getTransaction(txn_id);
assert(txn_trace != null);
Integer base_partition = this.getBasePartition(txn_trace);
// Skip any txn that executes on a partition that we're not evaluating
if (this.all_partitions.contains(base_partition) == false) continue;
int c = (int)clusterer.clusterInstance(inst);
// Debug Stuff
// Pair<Long, Integer> key = Pair.of((Long)txn_trace.getParam(1), ((Object[])txn_trace.getParam(4)).length);
// if (key_to_cluster.containsKey(key) == false) key_to_cluster.put(key, new Histogram());
// key_to_cluster.get(key).put(c);
// if (cluster_to_key.containsKey(c) == false) cluster_to_key.put(c, new Histogram());
// cluster_to_key.get(c).put(key);
// if (debug.val) LOG.debug(String.format("[%s, %s] => %d", , c));
// Ok so now let's figure out what this mofo is going to do...
PartitionSet partitions = this.getAllPartitions(txn_trace);
boolean singlepartitioned = (partitions.size() == 1);
t_counters[singlepartitioned ? 0 : 1]++;
t_counters[2]++; // Total # of Txns
// Estimate Clusterer MarkovGraphCost
MarkovCostModel c_costmodel = state.costmodels_per_partition[base_partition.intValue()];
double c_cost = 0.0;
TxnToClusterMarkovGraphsContainer markovs = state.markovs_per_partition[base_partition.intValue()];
markovs.addTransactionClusterXref(txn_id, c);
MarkovGraph markov = markovs.get(c, catalog_proc);
// Check that this is a cluster that we've seen before at this partition
if (markov == null) {
if (trace.val) LOG.warn(String.format("Txn #%d was mapped to never before seen Cluster #%d at partition %d", txn_id, c, base_partition));
markov = markovs.getOrCreate(c, this.catalog_proc).initialize();
markovs.addTransactionClusterXref(txn_id, c);
// state.t_estimators_per_partition[base_partition.intValue()].processTransactionTrace(txn_trace);
c_counters[2]++; // Unknown Clusters
}
c_cost = c_costmodel.estimateTransactionCost(catalogContext, txn_trace);
if (c_cost > 0) {
total_c_cost += c_cost;
c_counters[singlepartitioned ? 0 : 1]++;
// So that we can improve our predictions...
markov.processTransaction(txn_trace, p_estimator);
markov.calculateProbabilities(catalogContext.getAllPartitionIds());
// if (c_counters[singlepartitioned ? 0 : 1] == 1) {
//// MarkovPathEstimator.LOG.setLevel(Level.TRACE);
//// MarkovPathEstimator estimator = new MarkovPathEstimator(markov, c_costmodel.getTransactionEstimator(c), base_partition, txn_trace.getParams());
//// estimator.traverse(markov.getStartVertex());
//// List<Vertex> e_path = estimator.getVisitPath();
//
// List<Vertex> e_path = c_costmodel.getLastEstimatedPath();
// List<Vertex> a_path = c_costmodel.getLastActualPath();
// for (int ii = 0, cnt = Math.max(e_path.size(), a_path.size()); ii < cnt; ii++) {
// Vertex e = (ii < e_path.size() ? e_path.get(ii) : null);
// Vertex a = (ii < a_path.size() ? a_path.get(ii) : null);
// String match = (e != null && e.equals(a) ? "" : "***");
// System.err.println(String.format("%-60s%-10s%s", e, match, a));
// } // FOR
//
// System.err.println("singlepartitioned = " + singlepartitioned);
// System.err.println("cost = " + c_cost);
// System.err.println("all_partitions = " + all_partitions);
// System.err.println("actual partitions (R/W) = " + c_costmodel.getReadWritePartitions(a_path));
// System.err.println("estimated partitions (R/W) = " + c_costmodel.getReadWritePartitions(e_path));
// System.err.println(txn_trace.debug(catalog_db));
//
// LOG.debug("Writing out mispredicated MarkovGraph paths [c_cost=" + c_cost + "]");
// GraphvizExport<Vertex, Edge> gv = MarkovUtil.exportGraphviz(markov, false, markov.getPath(c_costmodel.getLastEstimatedPath()));
// gv.highlightPath(markov.getPath(c_costmodel.getLastActualPath()), "blue");
// System.err.println("GRAPHVIZ: " + gv.writeToTempFile(catalog_proc, (singlepartitioned ? "single" : "multi")));
// System.err.println();
//// System.exit(1);
//
// wrote_gv = true;
// // if (temp++ == 1) System.exit(1);
// }
}
} // FOR
if (debug.val) LOG.debug("Results: " + aset + "\n" + debugCounters(validationCnt, t_counters, c_counters, this.g_counters));
this.state_pool.returnObject(state);
aset.setCost(total_c_cost);
return (clusterer);
}
    /**
     * Cluster each training instance and build the per-partition, per-cluster MarkovGraphs.
     * @param state
     * @param trainingData
     * @throws Exception
     */
protected void generateMarkovGraphs(ExecutionState state, Instances trainingData) throws Exception {
// Now iterate over validation set and construct Markov models
// We have to know which field is our txn_id so that we can quickly access it
int trainingCnt = trainingData.numInstances();
if (trace.val) LOG.trace(String.format("Training MarkovGraphs using %d instances", trainingCnt));
ObjectHistogram<Integer> cluster_h = new ObjectHistogram<Integer>();
ObjectHistogram<Integer> partition_h = new ObjectHistogram<Integer>();
for (int i = 0; i < trainingCnt; i++) {
            // Grab the Instance and throw it at the clusterer to get the target cluster
// The original data set is going to have the txn id that we need to grab
// the proper TransactionTrace record from the workload
Instance inst = trainingData.instance(i);
int c = (int)state.clusterer.clusterInstance(inst);
cluster_h.put(c);
long txn_id = Long.valueOf(inst.stringValue(FeatureExtractor.TXNID_ATTRIBUTE_IDX));
TransactionTrace txn_trace = this.workload.getTransaction(txn_id);
assert(txn_trace != null) : "Invalid TxnId #" + txn_id + "\n" + inst;
// Figure out which base partition this txn would execute on
            // because we want to divide the MarkovGraphsContainers by the base partition
int base_partition = this.p_estimator.getBasePartition(txn_trace);
partition_h.put(base_partition);
// Build up the MarkovGraph for this specific cluster
MarkovGraphsContainer markovs = state.markovs_per_partition[base_partition];
MarkovGraph markov = markovs.get(c, this.catalog_proc);
if (markov == null) {
markov = markovs.getOrCreate(c, this.catalog_proc).initialize();
markovs.put(c, markov);
}
markov.processTransaction(txn_trace, this.p_estimator);
state.clusters_per_partition[base_partition].put(c);
} // FOR
// if (trace.val) LOG.trace("Clusters per Partition:\n" + StringUtil.formatMaps(state.clusters_per_partition));
}
    /**
     * Calculate the probabilities for every per-partition, per-cluster MarkovGraph so that
     * the corresponding MarkovCostModels can be used for path estimation.
     * @param state
     */
protected void generateMarkovCostModels(final ExecutionState state) {
// Now use the validation data set to figure out how well we are able to predict transaction
// execution paths using the trained Markov graphs
// We first need to construct a new costmodel and populate it with TransactionEstimators
if (trace.val) LOG.trace("Constructing CLUSTER-BASED MarkovCostModels");
// IMPORTANT: We run out of memory if we try to build the MarkovGraphs for all of the
// partitions+clusters. So instead we are going to randomly select some of the partitions to be used in the
// cost model estimation.
final CountDownLatch costmodel_latch = new CountDownLatch(this.all_partitions.size());
if (trace.val) LOG.trace(String.format("Generating MarkovGraphs for %d partitions", costmodel_latch.getCount()));
for (final int partition : this.all_partitions) {
final MarkovGraphsContainer markovs = state.markovs_per_partition[partition];
if (trace.val) LOG.trace(String.format("Calculating Partition #%d probabilities for %d clusters", partition, markovs.size()));
for (Entry<Integer, Map<Procedure, MarkovGraph>> e : markovs.entrySet()) {
// if (debug.val) LOG.debug(String.format("Partition %d - Cluster %d", partition, i++));
// Calculate the probabilities for each graph
for (MarkovGraph markov : e.getValue().values()) {
markov.calculateProbabilities(catalogContext.getAllPartitionIds());
} // FOR
} // FOR
if (trace.val) LOG.trace(String.format("Finished processing MarkovGraphs for Partition #%d [count=%d]", partition, costmodel_latch.getCount()));
costmodel_latch.countDown();
// this.generate_threadPool.execute(r);
} // FOR
// // Wait until everyone finishes
// try {
// costmodel_latch.await();
// } catch (Exception ex) {
// throw new RuntimeException(ex);
// }
}
    /**
     * Build a FilteredClusterer that clusters the training data using only the attributes
     * in the given MarkovAttributeSet.
     * @param aset
     * @param trainingData
     * @throws Exception
     */
protected AbstractClusterer createClusterer(MarkovAttributeSet aset, Instances trainingData) throws Exception {
if (trace.val) LOG.trace(String.format("Clustering %d %s instances with %d attributes", trainingData.numInstances(), CatalogUtil.getDisplayName(catalog_proc), aset.size()));
// Create the filter we need so that we only include the attributes in the given MarkovAttributeSet
Filter filter = aset.createFilter(trainingData);
// Using our training set to build the clusterer
int seed = this.rand.nextInt();
// SimpleKMeans inner_clusterer = new SimpleKMeans();
EM inner_clusterer = new EM();
        String options[] = {
            "-N", Integer.toString(1000),   // number of clusters (originally meant to be num_partitions)
            "-S", Integer.toString(seed),   // random seed
            "-I", Integer.toString(100),    // max iterations
        };
inner_clusterer.setOptions(options);
FilteredClusterer filtered_clusterer = new FilteredClusterer();
filtered_clusterer.setFilter(filter);
filtered_clusterer.setClusterer(inner_clusterer);
AbstractClusterer clusterer = filtered_clusterer;
clusterer.buildClusterer(trainingData);
return (clusterer);
}
protected Classifier generateDecisionTree(AbstractClusterer clusterer, MarkovAttributeSet aset, Instances data) throws Exception {
// We need to create a new Attribute that has the ClusterId
Instances newData = data; // new Instances(data);
newData.insertAttributeAt(new Attribute("ClusterId"), newData.numAttributes());
Attribute cluster_attr = newData.attribute(newData.numAttributes()-1);
assert(cluster_attr != null);
assert(cluster_attr.index() > 0);
newData.setClass(cluster_attr);
// We will then tell the Classifier to predict that ClusterId based on the MarkovAttributeSet
ObjectHistogram<Integer> cluster_h = new ObjectHistogram<Integer>();
for (int i = 0, cnt = newData.numInstances(); i < cnt; i++) {
            // Grab the Instance and throw it at the clusterer to get the target cluster
Instance inst = newData.instance(i);
int c = (int)clusterer.clusterInstance(inst);
inst.setClassValue(c);
cluster_h.put(c);
} // FOR
System.err.println("Number of Elements: " + cluster_h.getValueCount());
System.err.println(cluster_h);
NumericToNominal filter = new NumericToNominal();
filter.setInputFormat(newData);
newData = Filter.useFilter(newData, filter);
String output = this.catalog_proc.getName() + "-labeled.arff";
FileUtil.writeStringToFile(output, newData.toString());
LOG.info("Wrote labeled data set to " + output);
// Decision Tree
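        // J48 is Weka's C4.5 decision tree learner; we train it to predict the ClusterId class attribute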
J48 j48 = new J48();
String options[] = {
"-S", Integer.toString(this.rand.nextInt()),
};
j48.setOptions(options);
// Make sure we add the ClusterId attribute to a new MarkovAttributeSet so that
// we can tell the Classifier to classify that!
FilteredClassifier fc = new FilteredClassifier();
MarkovAttributeSet classifier_aset = new MarkovAttributeSet(aset);
classifier_aset.add(cluster_attr);
fc.setFilter(classifier_aset.createFilter(newData));
fc.setClassifier(j48);
// Bombs away!
fc.buildClassifier(newData);
return (fc);
}
/**
     * Helper method to convert Feature keys to Attributes
* @param data
* @param prefixes
* @return
*/
public static Set<Attribute> prefix2attributes(Instances data, String...prefixes) {
Set<Attribute> attributes = new ListOrderedSet<Attribute>();
for (String key : prefixes) {
Attribute attribute = data.attribute(key);
assert(attribute != null) : "Invalid Attribute key '" + key + "'";
attributes.add(attribute);
} // FOR
return (attributes);
}
protected static String debugCounters(int validationCnt, int t_counters[], int c_counters[], int g_counters[]) {
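        // Each counters array is laid out as [0]=single-partition, [1]=multi-partition, and [2]=total txns
        // (for t_counters) or unknown clusters (for the clusterer/global counters). The single/multi slots
        // count mispredicted txns (those with a non-zero cost), so the table below reports accuracy as
        // (total - mispredicted) / total.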
int values[][] = new int[][]{
t_counters,
c_counters,
g_counters,
};
String labels[] = {
"Prediction Result",
"Single-Partition",
"Multi-Partition",
"Unknown Clusters",
};
int totals[] = {
t_counters[2],
t_counters[0],
t_counters[1],
t_counters[2],
};
final int total_txns = values[0][2];
final int value_len = Integer.toString(total_txns).length();
final String f = "%" + value_len + "d / %" + value_len + "d [%.03f]";
final ListOrderedMap<?, ?> maps[] = new ListOrderedMap<?, ?>[values.length];
for (int i = 0; i < values.length; i++) {
ListOrderedMap<String, String> m = new ListOrderedMap<String, String>();
int singlep = values[i][0];
int multip = values[i][1];
int missed = values[i][2];
if (i == 0) {
m.put("# of Evaluated Transactions", String.format(f, total_txns, validationCnt, (total_txns / (double)validationCnt)));
} else {
String prefix = (i == 1 ? "Clusterer" : "Global");
int inner[] = new int[]{
singlep + multip,
singlep,
multip,
total_txns - missed,
};
for (int ii = 0; ii < inner.length; ii++) {
String value = String.format(f, totals[ii] - inner[ii], // Count
totals[ii], // Total
1.0 - (inner[ii] / (double)totals[ii]) // Percentage
);
m.put(prefix + " " + labels[ii], value);
} // FOR
}
maps[i] = m;
} // FOR
return (StringUtil.formatMaps(maps));
}
/**
* Main!
* @param vargs
* @throws Exception
*/
public static void main(String[] vargs) throws Exception {
ArgumentsParser args = ArgumentsParser.load(vargs);
args.require(
ArgumentsParser.PARAM_CATALOG,
ArgumentsParser.PARAM_WORKLOAD,
ArgumentsParser.PARAM_MAPPINGS
);
// Number of threads
int num_threads = FeatureClusterer.DEFAULT_NUM_THREADS;
if (args.hasIntParam(ArgumentsParser.PARAM_MARKOV_THREADS)) {
num_threads = args.getIntParam(ArgumentsParser.PARAM_MARKOV_THREADS);
}
        // Get the procedure we're supposed to investigate
String proc_name = args.getOptParam(0);
Procedure catalog_proc = args.catalog_db.getProcedures().getIgnoreCase(proc_name);
assert(catalog_proc != null) : proc_name;
// And our Weka data file
// File arff_path = new File(args.getOptParam(1));
// assert(arff_path.exists()) : arff_path.getAbsolutePath();
// BufferedReader reader = new BufferedReader(new FileReader(arff_path));
// Instances data = new Instances(reader);
// reader.close();
// data = new Instances(data, 0, args.workload.getTransactionCount());
Instances data = null;
{
// Hopefully this will get garbage collected if we put it here...
FeatureExtractor fextractor = new FeatureExtractor(args.catalogContext);
Map<Procedure, FeatureSet> fsets = fextractor.calculate(args.workload);
FeatureSet fset = fsets.get(catalog_proc);
assert(fset != null) : "Failed to get FeatureSet for " + catalog_proc;
data = fset.export(catalog_proc.getName());
}
assert(data != null);
assert(args.workload.getTransactionCount() == data.numInstances());
PartitionSet partitions = null;
if (args.hasParam(ArgumentsParser.PARAM_WORKLOAD_RANDOM_PARTITIONS)) {
PartitionEstimator p_estimator = new PartitionEstimator(args.catalogContext);
final ObjectHistogram<Integer> h = new ObjectHistogram<Integer>();
for (TransactionTrace txn_trace : args.workload.getTransactions()) {
int base_partition = p_estimator.getBasePartition(txn_trace);
h.put(base_partition);
} // FOR
// System.err.println("# OF PARTITIONS: " + h.getValueCount());
// h.setKeepZeroEntries(true);
// for (Integer p : CatalogUtil.getAllPartitionIds(args.catalog_db)) {
// if (h.contains(p) == false) h.put(p, 0);
// }
// System.err.println(h);
// System.exit(1);
//
partitions = new PartitionSet(h.values());
} else {
partitions = args.catalogContext.getAllPartitionIds();
}
FeatureClusterer fclusterer = new FeatureClusterer(args.catalogContext,
catalog_proc,
args.workload,
partitions,
num_threads);
// Update split configuration variables
for (SplitType type : SplitType.values()) {
String param_name = String.format("%s.%s", ArgumentsParser.PARAM_MARKOV_SPLIT, type.name());
if (args.hasDoubleParam(param_name) == false) continue;
double percentage = args.getDoubleParam(param_name);
fclusterer.setSplitPercentage(type, percentage);
} // FOR
if (args.hasDoubleParam(ArgumentsParser.PARAM_MARKOV_TOPK)) {
fclusterer.setAttributeTopK(args.getDoubleParam(ArgumentsParser.PARAM_MARKOV_TOPK));
}
if (args.hasIntParam(ArgumentsParser.PARAM_MARKOV_ROUNDS)) {
fclusterer.setNumRounds(args.getIntParam(ArgumentsParser.PARAM_MARKOV_ROUNDS));
}
// MarkovAttributeSet aset = fclusterer.calculate(data);
// HACK
Set<Attribute> attributes = FeatureClusterer.prefix2attributes(data,
"ParamArrayLength-04"
// "ParamHashPartition-01"
);
MarkovAttributeSet aset = new MarkovAttributeSet(attributes);
Map<Integer, MarkovGraphsContainer> markovs = fclusterer.constructMarkovModels(aset, data);
File output = new File(catalog_proc.getName() + ".markovs");
MarkovGraphsContainerUtil.save(markovs, output);
// fclusterer.calculateGlobalCost();
// AbstractClusterer clusterer = fclusterer.calculateAttributeSetCost(aset);
// fclusterer.generateDecisionTree(clusterer, aset, data);
//
// System.err.println(aset + "\nCost: " + aset.getCost());
fclusterer.cleanup();
}
}