/***************************************************************************
* Copyright (C) 2011 by H-Store Project *
* Brown University *
* Massachusetts Institute of Technology *
* Yale University *
* *
* Permission is hereby granted, free of charge, to any person obtaining *
* a copy of this software and associated documentation files (the *
* "Software"), to deal in the Software without restriction, including *
* without limitation the rights to use, copy, modify, merge, publish, *
* distribute, sublicense, and/or sell copies of the Software, and to *
* permit persons to whom the Software is furnished to do so, subject to *
* the following conditions: *
* *
* The above copyright notice and this permission notice shall be *
* included in all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, *
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF *
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR *
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, *
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR *
* OTHER DEALINGS IN THE SOFTWARE. *
***************************************************************************/
package edu.brown.hstore;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.log4j.Logger;
import org.voltdb.CatalogContext;
import org.voltdb.ParameterSet;
import org.voltdb.SQLStmt;
import org.voltdb.catalog.CatalogMap;
import org.voltdb.catalog.PlanFragment;
import org.voltdb.catalog.Procedure;
import org.voltdb.catalog.Statement;
import org.voltdb.exceptions.MispredictionException;
import edu.brown.hstore.Hstoreservice.WorkFragment;
import edu.brown.hstore.conf.HStoreConf;
import edu.brown.interfaces.DebugContext;
import edu.brown.logging.LoggerUtil;
import edu.brown.logging.LoggerUtil.LoggerBoolean;
import edu.brown.plannodes.PlanNodeUtil;
import edu.brown.profilers.BatchPlannerProfiler;
import edu.brown.profilers.ProfileMeasurementUtil;
import edu.brown.statistics.FastIntHistogram;
import edu.brown.statistics.Histogram;
import edu.brown.utils.PartitionEstimator;
import edu.brown.utils.PartitionSet;
import edu.brown.utils.StringUtil;
import edu.uci.ics.jung.graph.DirectedSparseMultigraph;
/**
* @author pavlo
*/
public class BatchPlanner {
private static final Logger LOG = Logger.getLogger(BatchPlanner.class);
private static final LoggerBoolean debug = new LoggerBoolean();
private static final LoggerBoolean trace = new LoggerBoolean();
static {
    LoggerUtil.attachObserver(LOG, debug, trace);
}

// ----------------------------------------------------------------------------
// STATIC DATA MEMBERS
// ----------------------------------------------------------------------------

// First dependency id handed out when numbering WorkFragment inputs/outputs.
private static final int FIRST_DEPENDENCY_ID = 1;

/**
 * If the unique dependency ids option is enabled, all input/output
 * DependencyIds for WorkFragments will be globally unique.
 * @see HStoreConf.SiteConf.planner_unique_dependency_ids
 */
private static final AtomicInteger NEXT_DEPENDENCY_ID = new AtomicInteger(FIRST_DEPENDENCY_ID);

/**
 * Cached set of PlanFragment -> Set<PartitionIds>
 * Indexed by base partition id. Shared by all BatchPlanner instances;
 * lazily created in the constructor and reset via clear().
 */
private static Map<Statement, Map<PlanFragment, PartitionSet>> CACHED_FRAGMENT_PARTITION_MAPS[];

// ----------------------------------------------------------------------------
// GLOBAL DATA MEMBERS
// ----------------------------------------------------------------------------

// Site configuration singleton (read for planner_* options).
private final HStoreConf hstore_conf;
protected final CatalogContext catalogContext;
// The stored procedure that all Statements in this batch belong to.
protected final Procedure catalog_proc;
// The Statement handle for each stmt_index in the batch.
protected final Statement catalog_stmts[];
// Per-statement flag: Statement.getReadonly().
private final boolean stmt_is_readonly[];
// Per-statement flag: statement touches only replicated tables or only a
// secondary index (see the full constructor).
private final boolean stmt_is_replicatedonly[];
// Lazily-filled per-statement cache of sorted single-partition PlanFragments.
private final List<PlanFragment> sorted_singlep_fragments[];
// Lazily-filled per-statement cache of sorted multi-partition PlanFragments.
private final List<PlanFragment> sorted_multip_fragments[];
// Number of SQLStmts in the batch handled by this planner.
private final int batchSize;
// Number of statements in the batch that are not replicated-only.
private final int nonReplicatedStmtCount;
private final PartitionEstimator p_estimator;
// Reusable BatchPlan instance re-initialized on each call to plan().
private BatchPlan plan;
// PlanGraph cache keyed by the hash of the single-partition bitmap.
private final Map<Integer, PlanGraph> plan_graphs = new HashMap<Integer, PlanGraph>();
// Reusable scratch map of WorkFragment builders used during plan construction.
private final Map<Integer, WorkFragment.Builder> round_builders = new HashMap<Integer, WorkFragment.Builder>();
// If true, dependency ids come from the global NEXT_DEPENDENCY_ID counter.
private final boolean enable_unique_ids;
// If true, the planner always assumes single-partition plans (testing path).
private final boolean force_singlePartition;
// Set via setPrefetchFlag(); presumably marks generated WorkFragments as
// prefetch work -- confirm in createWorkFragmentsBuilders().
private boolean prefetch = false;
// Scratch structures reused while building a PlanGraph.
private final Map<Integer, Set<PlanVertex>> output_dependency_xref = new HashMap<Integer, Set<PlanVertex>>();
private final List<Integer> output_dependency_xref_clear = new ArrayList<Integer>();
private final List<PlanVertex> sorted_vertices = new ArrayList<PlanVertex>();

// FAST SINGLE-PARTITION LOOKUP CACHE
// (all of the following are null when site.planner_caching is disabled)
private final boolean cache_isSinglePartition[];
// Per-statement parameter offsets used for the fast single-partition check.
private final int cache_fastLookups[][];
// Cached all-single-partition BatchPlan, indexed by base partition.
private final BatchPlan cache_singlePartitionPlans[];
// Reference into CACHED_FRAGMENT_PARTITION_MAPS for this txn's base partition.
private Map<Statement, Map<PlanFragment, PartitionSet>> cache_singlePartitionFragmentPartitions;

// PROFILING
private BatchPlannerProfiler profiler;
// ----------------------------------------------------------------------------
// INTERNAL PLAN GRAPH ELEMENTS
// ----------------------------------------------------------------------------
/**
 * A vertex in the PlanGraph: one PlanFragment scheduled at a particular
 * (stmt_index, round) position in the batch, annotated with the dependency
 * ids that wire it to other fragments.
 */
protected static class PlanVertex { // extends AbstractVertex {
    final PlanFragment catalog_frag;
    // Catalog id of the PlanFragment.
    final int frag_id;
    // Offset of the owning Statement within the batch.
    final int stmt_index;
    // Dispatch round this fragment executes in.
    final int round;
    // Dependency id of the input this fragment consumes.
    final int input_dependency_id;
    // Dependency id of the output this fragment produces.
    final int output_dependency_id;
    // Pre-computed packed identity value (see constructor note).
    final int hash_code;
    // Mirrors PlanFragment.getReadonly().
    final boolean read_only;

    public PlanVertex(PlanFragment catalog_frag, int stmt_index, int round, int input_dependency_id,
                      int output_dependency_id, boolean is_local) {
        // super(catalog_frag);
        // NOTE: the is_local parameter is currently unused.
        this.catalog_frag = catalog_frag;
        this.frag_id = catalog_frag.getId();
        this.stmt_index = stmt_index;
        this.round = round;
        this.input_dependency_id = input_dependency_id;
        this.output_dependency_id = output_dependency_id;
        this.read_only = catalog_frag.getReadonly();
        // Pack (frag_id, round, stmt_index) into a single int. equals() below
        // compares ONLY this packed value, which assumes frag_id fits in 20
        // bits and round in 6 bits without overlap.
        // NOTE(review): confirm catalog id / round ranges guarantee no collisions.
        this.hash_code = this.frag_id | this.round << 20 | this.stmt_index << 26;
    }

    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof PlanVertex))
            return (false);
        // Identity is fully determined by the packed hash (see constructor).
        return (this.hash_code == ((PlanVertex) obj).hash_code);
    }

    @Override
    public int hashCode() {
        return (this.hash_code);
    }

    @Override
    public String toString() {
        return String.format("<FragId=%02d, StmtIndex=%02d, Round=%02d, Input=%02d, Output=%02d>", this.frag_id,
                this.stmt_index, this.round, this.input_dependency_id, this.output_dependency_id);
    }
} // END CLASS
/**
 * Directed multigraph of PlanVertex nodes whose edge labels are dependency
 * ids. One PlanGraph is built per unique single-partition bitmap of a batch
 * and cached in BatchPlanner.plan_graphs for reuse across transactions.
 */
protected static class PlanGraph extends DirectedSparseMultigraph<PlanVertex, Integer> {
    private static final long serialVersionUID = 1L;

    /**
     * The number of dispatch rounds that we have in this plan
     */
    private int num_rounds = 0;

    // Vertices in execution order -- populated outside this class,
    // presumably by BatchPlanner.buildPlanGraph() (not visible here).
    private PlanVertex sorted_vertices[];

    /**
     * Single-Partition
     * Flattened parallel arrays (one entry per fragment) consumed by the
     * BatchPlan getFragmentIds()/get*DependencyIds() accessors.
     */
    private long fragmentIds[];
    private int input_ids[];
    private int output_ids[];

    public PlanGraph() {
        // super(catalog_db);
    }
} // END CLASS
// ----------------------------------------------------------------------------
// BATCH PLAN
// ----------------------------------------------------------------------------
/**
 * BatchPlan
 * The result of one invocation of BatchPlanner.plan(): describes which
 * partitions each Statement in the batch touches and which PlanFragments
 * must be dispatched. Instances are pre-allocated and reused across
 * invocations via init(); a plan marked as cached must never be re-init'd.
 */
public class BatchPlan {
    // ----------------------------------------------------------------------------
    // INVOCATION DATA MEMBERS
    // ----------------------------------------------------------------------------

    // True once this plan is stored in the planner's single-partition cache.
    private boolean cached = false;
    // Base partition of the txn this plan was last initialized for.
    private int base_partition = HStoreConstants.NULL_PARTITION_ID;
    // Shared dependency graph for this batch's single-partition bitmap.
    private PlanGraph graph;
    // Non-null if planning detected a misprediction for this txn.
    private MispredictionException mispredict;

    /** Temporary buffer space for sorting the PlanFragments per Statement */
    private final List<PlanFragment> frag_list[];

    /** Round# -> Map{PartitionId, Set{PlanFragments}} **/
    private final Collection<PlanVertex> rounds[][];
    private int rounds_length;

    /**
     * StmtIndex -> Target Partition Ids
     */
    private final PartitionSet[] stmt_partitions;
    // Holds the original per-statement PartitionSet when plan() swaps in a
    // shared singleton set for a cached single-partition statement; null
    // entries mean the live set in stmt_partitions is the owned one.
    private final PartitionSet[] stmt_partitions_swap;

    /**
     * StmtIndex -> Map{PlanFragment, Set<PartitionIds>}
     */
    private final Map<PlanFragment, PartitionSet> frag_partitions[];
    // Swap buffer paired with stmt_partitions_swap (same aliasing protocol).
    private final Map<PlanFragment, PartitionSet> frag_partitions_swap[];

    /**
     * A bitmap of whether each query at the given index in the batch was single-partitioned or not
     */
    private final boolean singlepartition_bitmap[];

    /**
     * Whether the fragments of this batch plan consist of read-only operations
     **/
    protected boolean readonly = true;

    /**
     * Whether the batch plan can all be executed locally
     */
    protected boolean all_local = true;

    /**
     * Whether the fragments in the batch plan can be executed on a single site
     */
    protected boolean all_singlepartitioned = true;

    /** check if all local fragment work is non-transactional **/
    // protected boolean localFragsAreNonTransactional = true;

    /**
     * Default Constructor Must call init() before this BatchPlan can be used
     * @param max_round_size maximum number of dispatch rounds to pre-allocate
     */
    @SuppressWarnings("unchecked")
    public BatchPlan(int max_round_size) {
        int batch_size = BatchPlanner.this.batchSize;
        int num_partitions = BatchPlanner.this.catalogContext.numberOfPartitions;

        // Round Data
        this.rounds = (Collection<PlanVertex>[][]) new Collection<?>[max_round_size][];
        for (int i = 0; i < this.rounds.length; i++) {
            this.rounds[i] = (Collection<PlanVertex>[]) new Collection<?>[num_partitions];
            // These lists will only be allocated when needed
        } // FOR

        // Batch Data
        this.frag_list = (List<PlanFragment>[]) new List<?>[batch_size];
        this.stmt_partitions = new PartitionSet[batch_size];
        this.stmt_partitions_swap = new PartitionSet[batch_size];
        this.frag_partitions = (Map<PlanFragment, PartitionSet>[]) new HashMap<?, ?>[batch_size];
        this.frag_partitions_swap = (Map<PlanFragment, PartitionSet>[]) new HashMap<?, ?>[batch_size];
        this.singlepartition_bitmap = new boolean[batch_size];
        for (int i = 0; i < batch_size; i++) {
            this.stmt_partitions[i] = new PartitionSet();
            this.frag_partitions[i] = new HashMap<PlanFragment, PartitionSet>();
        } // FOR
    }

    /**
     * Reset this plan so it can be reused for a new invocation.
     * Must never be called on a plan that has been cached.
     * @param base_partition the txn's base partition
     */
    private BatchPlan init(int base_partition) {
        assert (this.cached == false);
        this.base_partition = base_partition;
        this.mispredict = null;
        this.readonly = true;
        this.all_local = true;
        this.all_singlepartitioned = true;
        for (int i = 0; i < this.frag_list.length; i++) {
            if (this.frag_list[i] != null)
                this.frag_list[i] = null;
            // Only clear sets we own: if the swap slot is non-null, the live
            // entry is a shared cached singleton that must not be mutated.
            if (this.stmt_partitions[i] != null && this.stmt_partitions_swap[i] == null)
                this.stmt_partitions[i].clear();
        } // FOR
        for (int i = 0; i < this.rounds.length; i++) {
            for (int ii = 0; ii < this.rounds[i].length; ii++) {
                if (this.rounds[i][ii] != null) this.rounds[i][ii].clear();
            } // FOR
        } // FOR
        return (this);
    }

    /** Returns the BatchPlanner that owns this plan. */
    protected BatchPlanner getPlanner() {
        return (BatchPlanner.this);
    }

    /** Returns the dependency graph assigned by the last plan() call. */
    protected PlanGraph getPlanGraph() {
        return (this.graph);
    }

    /**
     * Returns true if this txn was hit by a MispredictionException when we were
     * constructing this batch plan.
     */
    public boolean hasMisprediction() {
        return (this.mispredict != null);
    }

    /**
     * Returns the MispredictionException for this batch plan.
     */
    public MispredictionException getMisprediction() {
        return (this.mispredict);
    }

    /**
     * Convert this batch plan into a list of WorkFragment builders.
     * The stmtCounters is a list of the number of times that we have executed each
     * query in the past for this transaction. The offset of each element in stmtCounters
     * corresponds to the stmtIndex in the SQLStmt batch.
     * @param txn_id
     * @param stmtCounters
     * @param builders output list that the builders are appended to
     */
    public void getWorkFragmentsBuilders(Long txn_id, int[] stmtCounters, List<WorkFragment.Builder> builders) {
        BatchPlanner.this.createWorkFragmentsBuilders(txn_id, this, stmtCounters, builders);
    }

    /** Returns the number of Statements in the owning planner's batch. */
    protected int getBatchSize() {
        return (BatchPlanner.this.batchSize);
    }

    /** Returns the number of fragments in the plan graph's flattened arrays. */
    public int getFragmentCount() {
        return (this.graph.fragmentIds.length);
    }

    public long[] getFragmentIds() {
        return (this.graph.fragmentIds);
    }

    public int[] getOutputDependencyIds() {
        return (this.graph.output_ids);
    }

    public int[] getInputDependencyIds() {
        return (this.graph.input_ids);
    }

    /**
     * Return an array of PartitionSets where each element in the array
     * corresponds to the partitions that the SQLStmt in the batch will need
     * to execute on.
     * @return
     */
    public final PartitionSet[] getStatementPartitions() {
        return (this.stmt_partitions);
    }

    /** True if every Statement in the batch is read-only. */
    public boolean isReadOnly() {
        return (this.readonly);
    }

    /** True if every fragment executes at the txn's base partition. */
    public boolean isLocal() {
        return (this.all_local);
    }

    /** True if every Statement in the batch is single-partitioned. */
    public boolean isSingleSited() {
        return (this.all_singlepartitioned);
    }

    public boolean isSingledPartitionedAndLocal() {
        return (this.all_singlepartitioned && this.all_local);
    }

    /** True if this plan lives in the planner's single-partition cache. */
    public boolean isCached() {
        return (this.cached);
    }

    @Override
    public String toString() {
        Map<String, Object> m = new LinkedHashMap<String, Object>();
        m.put("Read Only", this.readonly);
        m.put("All Local", this.all_local);
        m.put("All Single-Partitioned", this.all_singlepartitioned);
        return StringUtil.formatMaps(m);
    }
} // END CLASS
/**
 * Testing Constructor
 * The batchSize is assumed to be the length of batchStmts
 * @param batchStmts
 * @param catalog_proc
 * @param p_estimator
 */
public BatchPlanner(SQLStmt[] batchStmts, Procedure catalog_proc, PartitionEstimator p_estimator) {
    // Delegate to the full constructor with forceSinglePartition disabled.
    this(batchStmts, batchStmts.length, catalog_proc, p_estimator, false);
}
/**
 * Testing constructor where the planner is forced to choose single-partition queries.
 * The batchSize is assumed to be the length of batchStmts.
 * @param batchStmts
 * @param catalog_proc
 * @param p_estimator
 * @param forceSinglePartition
 */
protected BatchPlanner(SQLStmt[] batchStmts, Procedure catalog_proc, PartitionEstimator p_estimator,
        boolean forceSinglePartition) {
    // Delegate to the full constructor.
    this(batchStmts, batchStmts.length, catalog_proc, p_estimator, forceSinglePartition);
}
/**
 * Constructor
 * Builds a planner that does not force single-partition plans.
 *
 * @param batchStmts
 * @param batchSize
 * @param catalog_proc
 * @param p_estimator
 */
public BatchPlanner(SQLStmt[] batchStmts, int batchSize, Procedure catalog_proc, PartitionEstimator p_estimator) {
    // Delegate to the full constructor with forceSinglePartition disabled.
    this(batchStmts, batchSize, catalog_proc, p_estimator, false);
}
/**
 * Full Constructor
 * Pre-computes the per-statement read-only/replicated-only flags and, when
 * site.planner_caching is enabled, the fast single-partition lookup caches.
 *
 * @param batchStmts the SQLStmts in this batch (only the first batchSize entries are used)
 * @param batchSize number of statements in the batch
 * @param catalog_proc the Procedure this batch belongs to
 * @param p_estimator estimator used to compute touched partitions
 * @param forceSinglePartition if true, the planner always assumes single-partition plans
 */
@SuppressWarnings("unchecked")
public BatchPlanner(SQLStmt[] batchStmts,
                    int batchSize,
                    Procedure catalog_proc,
                    PartitionEstimator p_estimator,
                    boolean forceSinglePartition) {
    assert (catalog_proc != null);
    assert (p_estimator != null);

    this.hstore_conf = HStoreConf.singleton();
    this.catalog_proc = catalog_proc;
    this.catalogContext = p_estimator.getCatalogContext();
    this.batchSize = batchSize;
    this.p_estimator = p_estimator;
    this.plan = new BatchPlan(hstore_conf.site.planner_max_round_size);
    this.force_singlePartition = forceSinglePartition;
    this.enable_unique_ids = hstore_conf.site.planner_unique_dependency_ids;

    this.sorted_singlep_fragments = (List<PlanFragment>[]) new List<?>[this.batchSize];
    this.sorted_multip_fragments = (List<PlanFragment>[]) new List<?>[this.batchSize];
    this.catalog_stmts = new Statement[this.batchSize];
    this.stmt_is_readonly = new boolean[this.batchSize];
    this.stmt_is_replicatedonly = new boolean[this.batchSize];

    // Lookup caches are only allocated when planner caching is enabled
    this.cache_isSinglePartition = (hstore_conf.site.planner_caching ? new boolean[this.batchSize] : null);
    this.cache_fastLookups = (hstore_conf.site.planner_caching ? new int[this.batchSize][] : null);
    this.cache_singlePartitionPlans = (hstore_conf.site.planner_caching ? new BatchPlan[this.catalogContext.numberOfPartitions] : null);

    int nonReplicatedStmtCnt = 0;
    for (int i = 0; i < this.batchSize; i++) {
        this.catalog_stmts[i] = batchStmts[i].getStatement();
        this.stmt_is_readonly[i] = batchStmts[i].getStatement().getReadonly();
        // A statement is treated as "replicated only" if it touches only
        // replicated tables or only a secondary index
        this.stmt_is_replicatedonly[i] = batchStmts[i].getStatement().getReplicatedonly() ||
                                         batchStmts[i].getStatement().getSecondaryindex();
        if (this.stmt_is_replicatedonly[i] == false) nonReplicatedStmtCnt++;
        if (trace.val)
            LOG.trace(String.format("INIT[%d] %s -> isReplicatedOnly[%s]",
                      i, this.catalog_stmts[i].fullName(), this.stmt_is_replicatedonly[i]));

        // CACHING
        // Since most batches are going to be single-partition, we will cache the
        // parameter offsets on how to determine whether a Statement is multi-partition or not
        if (hstore_conf.site.planner_caching) {
            this.cache_fastLookups[i] = p_estimator.getStatementEstimationParameters(this.catalog_stmts[i]);
            if (trace.val)
                LOG.trace(String.format("INIT[%d] %s Cached Fast-Lookup: %s",
                          i, this.catalog_stmts[i].fullName(), Arrays.toString(this.cache_fastLookups[i])));
        }
    } // FOR
    this.nonReplicatedStmtCount = nonReplicatedStmtCnt;

    // Static Cache Members
    // FIX: The previous code used double-checked locking here, but
    // CACHED_FRAGMENT_PARTITION_MAPS is not volatile, so the unsynchronized
    // fast-path null check was unsafe under the Java Memory Model (a thread
    // could observe a non-null but not-yet-fully-initialized array).
    // Constructors are rare, so we simply always take the class lock;
    // clear() is 'static synchronized' and therefore uses the same monitor.
    synchronized (BatchPlanner.class) {
        if (CACHED_FRAGMENT_PARTITION_MAPS == null)
            BatchPlanner.clear(this.catalogContext.numberOfPartitions);
    } // SYNCH
}
/**
 * Clear out internal cache
 *
 * @param num_partitions
 *            The total number of partitions in the cluster
 */
@SuppressWarnings("unchecked")
public static synchronized void clear(int num_partitions) {
    // Build the replacement lookup table first, then publish it with a
    // single assignment to the static field.
    Map<Statement, Map<PlanFragment, PartitionSet>> freshMaps[] =
            (Map<Statement, Map<PlanFragment, PartitionSet>>[]) new Map<?, ?>[num_partitions];
    for (int partition = 0; partition < num_partitions; partition++) {
        freshMaps[partition] = new HashMap<Statement, Map<PlanFragment, PartitionSet>>();
    } // FOR
    CACHED_FRAGMENT_PARTITION_MAPS = freshMaps;
}
/** Returns the Procedure that this planner was constructed for. */
public Procedure getProcedure() {
    return this.catalog_proc;
}

/** Returns the Statement handles for the batch, indexed by stmt_index. */
public Statement[] getStatements() {
    return this.catalog_stmts;
}

/** Marks whether the plans produced by this planner are prefetch work. */
public void setPrefetchFlag(boolean val) {
    this.prefetch = val;
}

/**
 * Return the Statement within this batch at the given offset
 *
 * @return
 */
public Statement getStatement(int idx) {
    return this.catalog_stmts[idx];
}

/** Returns the number of SQLStmts in this planner's batch. */
public int getBatchSize() {
    return (this.batchSize);
}
/**
* Generate a new BatchPlan for a batch of queries requested by the txn
*
* @param txn_id
* @param base_partition
* @param predict_partitions
* @param touched_partitions
* @param batchArgs
* @return
*/
public BatchPlan plan(final Long txn_id,
final int base_partition,
final PartitionSet predict_partitions,
final FastIntHistogram touched_partitions,
final ParameterSet[] batchArgs) {
final boolean predict_singlePartitioned = (predict_partitions.size() == 1);
if (hstore_conf.site.planner_profiling) {
if (this.profiler == null)
this.profiler = new BatchPlannerProfiler();
this.profiler.plan_time.start();
this.profiler.transactions.incrementAndGet();
}
if (debug.val) {
LOG.debug(String.format("Constructing a new %s BatchPlan for %s txn #%d",
this.catalog_proc.getName(),
(predict_singlePartitioned ? "single-partition" : "distributed"), txn_id));
if (trace.val) {
Map<String, Object> m = new LinkedHashMap<String, Object>();
m.put("Batch Size", this.batchSize);
for (int i = 0; i < this.batchSize; i++) {
String key = String.format("[%02d] %s", i, this.catalog_stmts[i].getName());
m.put(key, Arrays.toString(batchArgs[i].toArray()));
}
LOG.trace("Query Batch Dump\n" + StringUtil.formatMapsBoxed(m));
}
}
// OPTIMIZATION: Check whether we can use a cached single-partition BatchPlan
if (this.force_singlePartition || this.cache_fastLookups != null) {
boolean is_allSinglePartition = true;
// OPTIMIZATION: Skip all of this if we know that we're always
// suppose to be single-partitioned
if (this.force_singlePartition == false) {
for (int stmt_index = 0; stmt_index < this.batchSize; stmt_index++) {
// If we don't have a cached fast-lookup here, then we need to check
// whether the statement is accessing only replicated tables and is read-only
if (this.cache_fastLookups[stmt_index] == null) {
if (this.stmt_is_replicatedonly[stmt_index] && this.stmt_is_readonly[stmt_index]) {
if (debug.val)
LOG.debug(String.format("[#%d-%02d] No fast look-ups for %s but stmt is replicated + read-only.",
txn_id, stmt_index, this.catalog_stmts[stmt_index].fullName()));
this.cache_isSinglePartition[stmt_index] = true;
}
else {
if (debug.val)
LOG.debug(String.format("[#%d-%02d] No fast look-ups for %s. Cache is marked as not single-partitioned",
txn_id, stmt_index, this.catalog_stmts[stmt_index].fullName()));
this.cache_isSinglePartition[stmt_index] = false;
}
}
// Otherwise, we'll use our fast look-ups to check to make sure that the
// statement's input parameters match the txn's base partition
else {
if (debug.val)
LOG.debug(String.format("[#%d-%02d] Using fast-lookup caching for %s: %s", txn_id,
stmt_index, this.catalog_stmts[stmt_index].fullName(),
Arrays.toString(this.cache_fastLookups[stmt_index])));
Object params[] = batchArgs[stmt_index].toArray();
this.cache_isSinglePartition[stmt_index] = true;
for (int idx : this.cache_fastLookups[stmt_index]) {
int hash = p_estimator.getHasher().hash(params[idx], this.catalog_stmts[stmt_index]);
if (hash != base_partition) {
if (debug.val)
LOG.debug(String.format("[#%d-%02d] Failed to match cached partition info for %s at idx=%d: " +
"hash[%d] != basePartition[%d]",
txn_id, stmt_index, this.catalog_stmts[stmt_index].fullName(), idx,
hash, base_partition));
this.cache_isSinglePartition[stmt_index] = false;
break;
}
} // FOR
}
if (trace.val)
LOG.trace(String.format("[#%d-%02d] cache_isSinglePartition[%s] = %s",
txn_id, stmt_index,
this.catalog_stmts[stmt_index].fullName(), this.cache_isSinglePartition[stmt_index]));
is_allSinglePartition = is_allSinglePartition && this.cache_isSinglePartition[stmt_index];
} // FOR (Statement)
}
if (trace.val)
LOG.trace(String.format("[#%d] is_allSinglePartition=%s", txn_id, is_allSinglePartition));
// If all of the Statements are single-partition, then we can use
// the cached BatchPlan if we already have one.
// This saves a lot of trouble
if (is_allSinglePartition && this.cache_singlePartitionPlans[base_partition] != null) {
if (debug.val)
LOG.debug(String.format("[#%d] Using cached BatchPlan at partition #%02d: %s", txn_id,
base_partition, Arrays.toString(this.catalog_stmts)));
if (hstore_conf.site.planner_profiling && profiler != null) {
profiler.plan_time.stop();
profiler.cached.incrementAndGet();
}
touched_partitions.put(base_partition, this.nonReplicatedStmtCount);
return (this.cache_singlePartitionPlans[base_partition]);
}
}
// Otherwise we have to construct a new BatchPlan
this.plan.init(base_partition);
// Only maintain the histogram of what partitions were touched if we
// know that we're going to throw a MispredictionException
Histogram<Integer> mispredict_h = null;
boolean mispredict = false;
// ----------------------------------------------------------------------------
// MAIN LOGIC LOOP
// This is where we go through each SQLStmt in the batch and figure out
// what partitions it needs to touch.
// ----------------------------------------------------------------------------
for (int stmt_index = 0; stmt_index < this.batchSize; stmt_index++) {
final Statement catalog_stmt = this.catalog_stmts[stmt_index];
assert (catalog_stmt != null) :
String.format("The Statement at index %d is null for %s",
stmt_index, this.catalog_proc);
final Object params[] = batchArgs[stmt_index].toArray();
if (trace.val)
LOG.trace(String.format("[#%d-%02d] Calculating touched partitions plans for %s",
txn_id, stmt_index, catalog_stmt.fullName()));
Map<PlanFragment, PartitionSet> frag_partitions = plan.frag_partitions[stmt_index];
PartitionSet stmt_all_partitions = plan.stmt_partitions[stmt_index];
boolean has_singlepartition_plan = catalog_stmt.getHas_singlesited();
boolean is_replicated_only = this.stmt_is_replicatedonly[stmt_index];
boolean is_read_only = this.stmt_is_readonly[stmt_index];
boolean is_singlePartition = has_singlepartition_plan;
boolean is_local = true;
CatalogMap<PlanFragment> fragments = null;
// OPTIMIZATION: Fast partition look-up caching
// OPTIMIZATION: Read-only queries on replicated tables always just
// go to the local partition
// OPTIMIZATION: If we're force to be single-partitioned, pretend
// that the table is replicated
if ((this.cache_isSinglePartition != null && this.cache_isSinglePartition[stmt_index]) ||
(is_replicated_only && is_read_only) ||
(this.force_singlePartition)) {
if (trace.val) {
if (this.cache_isSinglePartition[stmt_index]) {
LOG.trace(String.format("[#%d-%02d] Using fast-lookup for %s. " +
"Skipping PartitionEstimator",
txn_id, stmt_index, catalog_stmt.fullName()));
} else {
LOG.trace(String.format("[#%d-%02d] %s is read-only and replicate-only." +
"Skipping PartitionEstimator",
txn_id, stmt_index, catalog_stmt.fullName()));
}
}
assert (has_singlepartition_plan);
if (this.cache_singlePartitionFragmentPartitions == null) {
this.cache_singlePartitionFragmentPartitions = CACHED_FRAGMENT_PARTITION_MAPS[base_partition];
}
Map<PlanFragment, PartitionSet> cached_frag_partitions = this.cache_singlePartitionFragmentPartitions.get(catalog_stmt);
if (cached_frag_partitions == null) {
cached_frag_partitions = new HashMap<PlanFragment, PartitionSet>();
PartitionSet p = this.catalogContext.getPartitionSetSingleton(base_partition);
for (PlanFragment catalog_frag : catalog_stmt.getFragments().values()) {
cached_frag_partitions.put(catalog_frag, p);
} // FOR
this.cache_singlePartitionFragmentPartitions.put(catalog_stmt, cached_frag_partitions);
}
if (plan.stmt_partitions_swap[stmt_index] == null) {
plan.stmt_partitions_swap[stmt_index] = plan.stmt_partitions[stmt_index];
plan.frag_partitions_swap[stmt_index] = plan.frag_partitions[stmt_index];
}
stmt_all_partitions = plan.stmt_partitions[stmt_index] = this.catalogContext.getPartitionSetSingleton(base_partition);
frag_partitions = plan.frag_partitions[stmt_index] = cached_frag_partitions;
}
// Otherwise figure out whether the query can execute as
// single-partitioned or not
else {
if (debug.val)
LOG.debug(String.format("[#%d-%02d] Computing touched partitions %s in txn #%d", txn_id,
stmt_index, catalog_stmt.fullName(), txn_id));
if (plan.stmt_partitions_swap[stmt_index] != null) {
stmt_all_partitions = plan.stmt_partitions[stmt_index] = plan.stmt_partitions_swap[stmt_index];
plan.stmt_partitions_swap[stmt_index] = null;
stmt_all_partitions.clear();
frag_partitions = plan.frag_partitions[stmt_index] = plan.frag_partitions_swap[stmt_index];
plan.frag_partitions_swap[stmt_index] = null;
}
try {
// OPTIMIZATION: If we were told that the transaction is suppose to be
// single-partitioned, then we will throw the single-partitioned PlanFragments
// at the PartitionEstimator to get back what partitions each PlanFragment
// will need to go to. If we get multiple partitions, then we know that we
// mispredicted and we should throw a MispredictionException
// If we originally didn't predict that it was single-partitioned, then we
// actually still need to check whether the query should be single-partitioned or not.
// This is because a query may actually just want to execute on just one
// partition (note that it could be a local partition or the remote partition).
// We'll assume that it's single-partition <<--- Can we cache that??
while (true) {
if (is_singlePartition == false) stmt_all_partitions.clear();
fragments = (is_singlePartition ? catalog_stmt.getFragments() : catalog_stmt.getMs_fragments());
if (debug.val)
LOG.debug(String.format("[#%d-%02d] Estimating for %d %s-partition fragments",
txn_id, stmt_index, fragments.size(),
(is_singlePartition ? "single" : "multi")));
// PARTITION ESTIMATOR
if (hstore_conf.site.planner_profiling && profiler != null)
ProfileMeasurementUtil.swap(profiler.plan_time, profiler.partest_time);
this.p_estimator.getAllFragmentPartitions(frag_partitions,
stmt_all_partitions,
fragments.values(),
params,
base_partition);
if (hstore_conf.site.planner_profiling && profiler != null)
ProfileMeasurementUtil.swap(profiler.partest_time, profiler.plan_time);
int stmt_all_partitions_size = stmt_all_partitions.size();
if (is_singlePartition && stmt_all_partitions_size > 1) {
// If this was suppose to be multi-partitioned, then
// we want to stop right here!!
if (predict_singlePartitioned) {
if (trace.val)
LOG.trace(String.format("Mispredicted txn #%d - Multiple Partitions %s",
txn_id, stmt_all_partitions));
mispredict = true;
break;
}
// Otherwise we can let it wrap back around and construct the fragment
// mapping for the multi-partition PlanFragments
is_singlePartition = false;
continue;
}
is_local = (stmt_all_partitions_size == 1 && stmt_all_partitions.contains(base_partition));
if (is_local == false && predict_singlePartitioned) {
// Again, this is not what was suppose to happen!
if (trace.val)
LOG.trace(String.format("Mispredicted txn #%d - Remote Partitions %s",
txn_id, stmt_all_partitions));
mispredict = true;
break;
} else if (predict_partitions.containsAll(stmt_all_partitions) == false) {
// Again, this is not what was suppose to happen!
if (trace.val)
LOG.trace(String.format("Mispredicted txn #%d - Unallocated Partitions %s / %s",
txn_id, stmt_all_partitions, predict_partitions));
mispredict = true;
break;
}
// Score! We have a plan that works!
break;
} // WHILE
// Bad Mojo!
} catch (Exception ex) {
String msg = "";
for (int i = 0; i < this.batchSize; i++) {
msg += String.format("[#%d-%02d] %s %s\n%5s\n", txn_id, i, catalog_stmt.fullName(),
catalog_stmt.getSqltext(), Arrays.toString(batchArgs[i].toArray()));
} // FOR
LOG.fatal("\n" + msg);
throw new RuntimeException("Unexpected error when planning " + catalog_stmt.fullName(), ex);
}
}
if (debug.val)
LOG.debug(String.format("[#%d-%02d] is_singlepartition=%s, partitions=%s",
txn_id, stmt_index, is_singlePartition, stmt_all_partitions));
// Get a sorted list of the PlanFragments that we need to execute
// for this query
if (is_singlePartition) {
if (this.sorted_singlep_fragments[stmt_index] == null) {
this.sorted_singlep_fragments[stmt_index] = PlanNodeUtil.getSortedPlanFragments(catalog_stmt, true);
}
plan.frag_list[stmt_index] = this.sorted_singlep_fragments[stmt_index];
// Only mark that we touched these partitions if the Statement
// is not on a replicated table or it's not read-only
if (is_replicated_only == false || is_read_only == false) {
touched_partitions.put(stmt_all_partitions.get());
}
}
// Distributed Query
else {
if (this.sorted_multip_fragments[stmt_index] == null) {
this.sorted_multip_fragments[stmt_index] = PlanNodeUtil.getSortedPlanFragments(catalog_stmt, false);
}
plan.frag_list[stmt_index] = this.sorted_multip_fragments[stmt_index];
// Always mark that we are touching these partitions
touched_partitions.put(stmt_all_partitions.values());
// Note that will want to update is_singlePartitioned here for non-readonly replicated
// querys when we have a one partition cluster because those queries don't have
// single-partition query plans
// if (this.num_partitions == 1 && is_replicated_only && is_read_only == false) {
// is_singlePartition = true;
// }
}
plan.readonly = plan.readonly && catalog_stmt.getReadonly();
plan.all_singlepartitioned = plan.all_singlepartitioned && is_singlePartition;
plan.all_local = plan.all_local && is_local;
// Keep track of whether the current query in the batch was
// single-partitioned or not
plan.singlepartition_bitmap[stmt_index] = is_singlePartition;
// Misprediction!!
if (mispredict) {
// If this is the first Statement in the batch that hits the mispredict,
// then we need to create the histogram and populate it with the
// partitions from the previous queries
int start_idx = stmt_index;
if (mispredict_h == null) {
mispredict_h = new FastIntHistogram();
start_idx = 0;
}
for (int i = start_idx; i <= stmt_index; i++) {
if (debug.val)
LOG.debug(String.format("Pending mispredict for txn #%d. " +
"Checking whether to add partitions for batch statement %02d",
txn_id, i));
// Make sure that we don't count the local partition if it
// was reading a replicated table.
if (this.stmt_is_replicatedonly[i] == false ||
(this.stmt_is_replicatedonly[i] && this.stmt_is_readonly[i] == false)) {
if (trace.val)
LOG.trace(String.format("%s touches non-replicated table. " +
"Including %d partitions in mispredict histogram for txn #%d",
this.catalog_stmts[i].fullName(), plan.stmt_partitions[i].size(), txn_id));
mispredict_h.put(plan.stmt_partitions[i]);
}
} // FOR
continue;
}
// ----------------------
// DEBUG DUMP
// ----------------------
if (debug.val) {
List<PlanFragment> _fragments = null;
if (is_singlePartition && this.sorted_singlep_fragments[stmt_index] != null) {
_fragments = this.sorted_singlep_fragments[stmt_index];
} else {
_fragments = this.sorted_multip_fragments[stmt_index];
}
Map<?, ?> maps[] = new Map[_fragments.size() + 1];
int ii = 0;
for (PlanFragment catalog_frag : _fragments) {
Map<String, Object> m = new LinkedHashMap<String, Object>();
PartitionSet p = plan.frag_partitions[stmt_index].get(catalog_frag);
boolean frag_local = (p.size() == 1 && p.contains(base_partition));
m.put(String.format("[%02d] Fragment", ii), catalog_frag.fullName());
m.put(String.format(" Partitions"), p);
m.put(String.format(" IsLocal"), frag_local);
ii++;
maps[ii] = m;
} // FOR
Map<String, Object> header = new LinkedHashMap<String, Object>();
header.put("Batch Statement", String.format("#%d / %d", stmt_index, this.batchSize));
header.put("Catalog Statement", catalog_stmt.fullName());
header.put("Statement SQL", catalog_stmt.getSqltext());
header.put("All Partitions", plan.stmt_partitions[stmt_index]);
header.put("Local Partition", base_partition);
header.put("IsSingledPartitioned", is_singlePartition);
header.put("IsStmtLocal", is_local);
header.put("IsReplicatedOnly", is_replicated_only);
header.put("IsBatchLocal", plan.all_local);
header.put("Fragments", _fragments.size());
maps[0] = header;
LOG.debug(String.format("[#%d-%02d]\n%s", txn_id, stmt_index, StringUtil.formatMapsBoxed(maps)));
}
} // FOR (Statement)
// Check whether we have an existing graph exists for this batch
// configuration
// This is the only place where we need to synchronize
int bitmap_hash = Arrays.hashCode(plan.singlepartition_bitmap);
PlanGraph graph = this.plan_graphs.get(bitmap_hash);
if (graph == null) { // assume fast case
graph = this.buildPlanGraph(plan);
this.plan_graphs.put(bitmap_hash, graph);
}
plan.graph = graph;
plan.rounds_length = graph.num_rounds;
if (hstore_conf.site.planner_profiling && profiler != null)
profiler.plan_time.stop();
// Create the MispredictException if any Statement in the loop above hit
// it. We don't want to throw it because whoever called us may want to look
// at the plan first
if (mispredict_h != null) {
plan.mispredict = new MispredictionException(txn_id, mispredict_h);
if (debug.val)
LOG.warn(String.format("Created %s for txn #%d\n%s",
plan.mispredict.getClass().getSimpleName(), txn_id,
plan.mispredict.getPartitions()));
}
// If this a single-partition plan and we have caching enabled, we'll
// add this to our cached listing. We'll mark it as cached so that it is never
// returned back to the BatchPlan object pool
else if (this.cache_singlePartitionPlans != null &&
this.cache_singlePartitionPlans[base_partition] == null &&
this.plan.isSingledPartitionedAndLocal()) {
this.cache_singlePartitionPlans[base_partition] = plan;
this.plan.cached = true;
this.plan = new BatchPlan(hstore_conf.site.planner_max_round_size);
return this.cache_singlePartitionPlans[base_partition];
}
if (debug.val)
LOG.debug(String.format("Created BatchPlan for txn #%d:\n%s", txn_id, this.plan.toString()));
return (this.plan);
}
/**
 * Utility method for converting a BatchPlan into WorkFragment.Builders.
 * The stmtCounters is a list of the number of times that we have executed each
 * query in the past for this transaction. The offset of each element in stmtCounters
 * corresponds to the stmtIndex in the SQLStmt batch.
 * @param txn_id the transaction that these WorkFragments belong to
 * @param plan a fully-constructed BatchPlan whose PlanGraph has already been built
 * @param stmtCounters per-statement execution counters for this transaction
 * @param builders output list that the generated WorkFragment.Builders are appended to
 */
protected void createWorkFragmentsBuilders(final Long txn_id,
                                           final BatchPlanner.BatchPlan plan,
                                           final int[] stmtCounters,
                                           final List<WorkFragment.Builder> builders) {
    if (hstore_conf.site.planner_profiling && profiler != null)
        profiler.fragment_time.start();
    if (debug.val)
        LOG.debug(String.format("Constructing list of WorkFragments to execute " +
                  "[txn_id=#%d, base_partition=%d]",
                  txn_id, plan.base_partition));

    // Bucket every PlanVertex by (round, partition) so that we can build one
    // group of WorkFragments per execution round below.
    // 2013-05-14: I feel like that we could probably cache this somehow...
    for (PlanVertex v : plan.graph.sorted_vertices) {
        int stmt_index = v.stmt_index;
        for (int partition : plan.frag_partitions[stmt_index].get(v.catalog_frag).values()) {
            if (plan.rounds[v.round][partition] == null) {
                plan.rounds[v.round][partition] = new ArrayList<PlanVertex>();
            }
            plan.rounds[v.round][partition].add(v);
        } // FOR
    } // FOR

    // The main idea of what we're trying to do here is to group together
    // all of the PlanFragments with the same input dependency ids into a single WorkFragment
    if (trace.val)
        LOG.trace("Generated " + plan.rounds_length + " rounds of tasks for txn #" + txn_id);
    for (int round = 0; round < plan.rounds_length; round++) {
        if (trace.val) LOG.trace(String.format("Txn #%d - Round %02d", txn_id, round));
        for (int partition = 0; partition < this.catalogContext.numberOfPartitions; partition++) {
            Collection<PlanVertex> vertices = plan.rounds[round][partition];
            if (vertices == null || vertices.isEmpty()) continue;

            // One WorkFragment.Builder per distinct input dependency id
            this.round_builders.clear();
            for (PlanVertex v : vertices) { // Does this order matter?
                // Check whether we can use an existing WorkFragment builder
                WorkFragment.Builder partitionBuilder = this.round_builders.get(v.input_dependency_id);
                if (partitionBuilder == null) {
                    partitionBuilder = WorkFragment.newBuilder().setPartitionId(partition);
                    this.round_builders.put(v.input_dependency_id, partitionBuilder);
                    // Assume read-only until we encounter a fragment that writes (below)
                    partitionBuilder.setReadOnly(true);
                    partitionBuilder.setPrefetch(this.prefetch);
                }
                // Fragment Id
                partitionBuilder.addFragmentId(v.frag_id);
                // Not all fragments will have an input dependency so this
                // could be the NULL_DEPENDENCY_ID
                partitionBuilder.addInputDepId(v.input_dependency_id);
                if (v.input_dependency_id != HStoreConstants.NULL_DEPENDENCY_ID) {
                    partitionBuilder.setNeedsInput(true);
                }
                // All fragments will produce some output
                partitionBuilder.addOutputDepId(v.output_dependency_id);
                // SQLStmt Counter
                partitionBuilder.addStmtCounter(stmtCounters[v.stmt_index]);
                // SQLStmt Index
                partitionBuilder.addStmtIndex(v.stmt_index);
                // SQLStmt Ignore
                // This query was already dispatched for prefetching, so we
                // actually don't want to really execute it.
                partitionBuilder.addStmtIgnore(false);
                // ParameterSet Index
                partitionBuilder.addParamIndex(v.stmt_index);
                // Read-Only: a single writing fragment makes the whole WorkFragment a write
                if (v.read_only == false) {
                    partitionBuilder.setReadOnly(v.read_only);
                }
                if (trace.val)
                    LOG.trace(String.format("Fragment Grouping %d => " +
                              "[txnId=#%d, partition=%d, fragId=%d, input=%d, output=%d, stmtIndex=%d]",
                              partitionBuilder.getFragmentIdCount(),
                              txn_id, partition, v.frag_id,
                              v.input_dependency_id, v.output_dependency_id, v.stmt_index));
            } // FOR (frag_idx)

            for (WorkFragment.Builder builder : this.round_builders.values()) {
                int fragmentCount = builder.getFragmentIdCount();
                if (fragmentCount == 0) {
                    if (trace.val) {
                        LOG.warn(String.format("For some reason we thought it would be a good idea to " +
                                 "construct a %s with no fragments! [txn_id=#%d]",
                                 WorkFragment.class.getSimpleName(), txn_id));
                        LOG.warn("In case you were wondering, this is a terrible idea, which is why we didn't do it!");
                    }
                    continue;
                }
                // Sanity check: every repeated field must line up 1:1 with the fragment ids
                assert(builder.getOutputDepIdCount() == fragmentCount) :
                    "OutputDepId:" + builder.getOutputDepIdCount() + "!=" + fragmentCount;
                assert(builder.getInputDepIdCount() == fragmentCount) :
                    "InputDepId:" + builder.getInputDepIdCount() + "!=" + fragmentCount;
                assert(builder.getParamIndexCount() == fragmentCount) :
                    "ParamIndex:" + builder.getParamIndexCount() + "!=" + fragmentCount;
                assert(builder.getStmtCounterCount() == fragmentCount) :
                    "StmtCounter:" + builder.getStmtCounterCount() + "!=" + fragmentCount;
                assert(builder.getStmtIndexCount() == fragmentCount) :
                    "StmtIndex:" + builder.getStmtIndexCount() + "!=" + fragmentCount;
                assert(builder.getStmtIgnoreCount() == fragmentCount) :
                    "StmtIgnore:" + builder.getStmtIgnoreCount() + "!=" + fragmentCount;
                builders.add(builder);
            } // FOR
        } // PARTITION
    } // ROUND
    assert (builders.size() > 0) : "Failed to generate any WorkFragments in this BatchPlan for txn #" + txn_id;
    if (debug.val)
        LOG.debug("Created " + builders.size() + " WorkFragment(s) for txn #" + txn_id);
    if (hstore_conf.site.planner_profiling && profiler != null)
        profiler.fragment_time.stop();
}
/**
 * Constructs the dependency graph (PlanGraph) for the given BatchPlan.
 * One PlanVertex is created per (Statement, PlanFragment) pair; vertices are
 * chained together through synthetic input/output dependency ids, and then
 * flattened into parallel arrays sorted by (stmt_index, round, frag_id).
 * @param plan a BatchPlan whose frag_list and frag_partitions are populated
 * @return a new PlanGraph for this batch configuration
 */
protected PlanGraph buildPlanGraph(BatchPlanner.BatchPlan plan) {
// Charge the time spent in here to graph_time instead of plan_time
if (hstore_conf.site.planner_profiling && profiler != null)
ProfileMeasurementUtil.swap(profiler.plan_time, profiler.graph_time);
PlanGraph graph = new PlanGraph();
// Reset the scratch state that this method reuses across invocations
this.sorted_vertices.clear();
this.output_dependency_xref_clear.clear();
int last_id = FIRST_DEPENDENCY_ID;
for (int stmt_index = 0; stmt_index < this.batchSize; stmt_index++) {
Map<PlanFragment, PartitionSet> frag_partitions = plan.frag_partitions[stmt_index];
assert (frag_partitions != null) : "No Fragment->PartitionIds map for Statement #" + stmt_index;
List<PlanFragment> fragments = plan.frag_list[stmt_index];
assert (fragments != null);
int num_fragments = fragments.size();
// Each fragment of a Statement runs in its own round, so the batch needs
// as many rounds as the longest fragment chain in any Statement
graph.num_rounds = Math.max(num_fragments, graph.num_rounds);
// Generate the synthetic DependencyIds for the query
int last_output_id = HStoreConstants.NULL_DEPENDENCY_ID;
for (int round = 0, cnt = num_fragments; round < cnt; round++) {
PlanFragment catalog_frag = fragments.get(round);
PartitionSet f_partitions = frag_partitions.get(catalog_frag);
assert (f_partitions != null) :
String.format("No PartitionIds for [%02d] %s in Statement #%d", round,
catalog_frag.fullName(), stmt_index);
boolean f_local = (f_partitions.size() == 1 && f_partitions.contains(plan.base_partition));
// Either hand out globally unique ids or a simple per-graph counter
final Integer output_id = Integer.valueOf(this.enable_unique_ids ?
BatchPlanner.NEXT_DEPENDENCY_ID.getAndIncrement() : last_id++);
// The previous round's output id becomes this fragment's input id
// (NULL_DEPENDENCY_ID for the first fragment of each Statement)
PlanVertex v = new PlanVertex(catalog_frag,
stmt_index,
round,
last_output_id,
output_id.intValue(),
f_local);
// output_dependency_xref is reused across calls; a stale entry for this
// id is cleared exactly once the first time we see it in this invocation
Set<PlanVertex> dependencies = output_dependency_xref.get(output_id);
if (dependencies == null) {
dependencies = new HashSet<PlanVertex>();
this.output_dependency_xref.put(output_id, dependencies);
} else if (this.output_dependency_xref_clear.contains(output_id) == false) {
dependencies.clear();
this.output_dependency_xref_clear.add(output_id);
}
dependencies.add(v);
graph.addVertex(v);
this.sorted_vertices.add(v);
last_output_id = output_id;
}
} // FOR
// Setup Edges
// Link every vertex that needs input to the vertices that produce that output
for (PlanVertex v0 : graph.getVertices()) {
if (v0.input_dependency_id == HStoreConstants.NULL_DEPENDENCY_ID)
continue;
for (PlanVertex v1 : output_dependency_xref.get(v0.input_dependency_id)) {
assert (!v0.equals(v1)) : v0;
// Skip duplicate edges between the same pair of vertices
if (!graph.findEdgeSet(v0, v1).isEmpty())
continue;
graph.addEdge(v0.input_dependency_id, v0, v1);
} // FOR
} // FOR
// Single-Partition Cache
// Flatten the vertices into parallel arrays in deterministic
// (stmt_index, round, frag_id) order for fast iteration later
Collections.sort(this.sorted_vertices, PLANVERTEX_COMPARATOR);
final int num_vertices = this.sorted_vertices.size();
graph.fragmentIds = new long[num_vertices];
graph.input_ids = new int[num_vertices];
graph.output_ids = new int[num_vertices];
int i = 0;
for (PlanVertex v : this.sorted_vertices) {
graph.fragmentIds[i] = v.frag_id;
graph.output_ids[i] = v.output_dependency_id;
graph.input_ids[i] = v.input_dependency_id;
i += 1;
} // FOR
graph.sorted_vertices = this.sorted_vertices.toArray(new PlanVertex[0]);
// Switch the profiler back to plan_time before returning to the caller
if (hstore_conf.site.planner_profiling && profiler != null)
ProfileMeasurementUtil.swap(profiler.graph_time, profiler.plan_time);
return (graph);
}
/**
 * Orders PlanVertex instances by (stmt_index, round, frag_id).
 * Uses Integer.compare() instead of subtraction so that the result can
 * never overflow for extreme id values.
 */
private static final Comparator<PlanVertex> PLANVERTEX_COMPARATOR = new Comparator<PlanVertex>() {
    @Override
    public int compare(PlanVertex o1, PlanVertex o2) {
        int cmp = Integer.compare(o1.stmt_index, o2.stmt_index);
        if (cmp == 0) cmp = Integer.compare(o1.round, o2.round);
        if (cmp == 0) cmp = Integer.compare(o1.frag_id, o2.frag_id);
        return (cmp);
    }
};
// ----------------------------------------------------------------------------
// DEBUG METHODS
// ----------------------------------------------------------------------------
/**
 * Exposes the BatchPlanner's internal cached state for testing/debugging.
 */
public class Debug implements DebugContext {
/** Returns the planner's current (reusable) BatchPlan instance. */
public BatchPlan getBatchPlan() {
return (plan);
}
/** Returns the profiler counters (null when profiling is disabled). */
public BatchPlannerProfiler getProfiler() {
return (profiler);
}
/** Whether the Statement at stmt_index was cached as touching only replicated tables. */
public boolean isCachedReplicatedOnly(int stmt_index) {
return (stmt_is_replicatedonly[stmt_index]);
}
/** Whether the Statement at stmt_index was cached as read-only. */
public boolean isCachedReadOnly(int stmt_index) {
return (stmt_is_readonly[stmt_index]);
}
/** Returns the cached fast-lookup array for the Statement at stmt_index. */
public int[] getCachedLookup(int stmt_index) {
return (cache_fastLookups[stmt_index]);
}
// NOTE(review): cache_singlePartitionPlans is indexed by base_partition
// elsewhere in this class, so this argument appears to be a partition id
// rather than a statement index -- confirm against callers before renaming.
public BatchPlan getCachedSinglePartitionPlan(int stmt_index) {
return (cache_singlePartitionPlans[stmt_index]);
}
}
/** Lazily-created debug wrapper; see getDebugContext(). */
private Debug cachedDebugContext;

/**
 * Returns the Debug context for this planner, creating it on first use.
 * Deliberately unsynchronized: a lost race merely builds an extra
 * throwaway Debug instance.
 */
public Debug getDebugContext() {
    Debug ctx = this.cachedDebugContext;
    if (ctx == null) {
        // We don't care if we're thread-safe here...
        ctx = new Debug();
        this.cachedDebugContext = ctx;
    }
    return ctx;
}
}