/* This file is part of VoltDB.
* Copyright (C) 2008-2014 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb.planner;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.voltdb.VoltType;
import org.voltdb.catalog.Column;
import org.voltdb.expressions.AbstractExpression;
import org.voltdb.expressions.ConstantValueExpression;
import org.voltdb.expressions.ParameterValueExpression;
import org.voltdb.expressions.TupleValueExpression;
import org.voltdb.planner.parseinfo.StmtSubqueryScan;
import org.voltdb.planner.parseinfo.StmtTableScan;
import org.voltdb.plannodes.SchemaColumn;
/**
* Represents the partitioning of the data underlying a statement.
* In the simplest case, this is pre-determined by the single-partition context of the statement
* from a stored procedure annotation or a single-statement procedure attribute.
* In the more interesting ad hoc case, a user can specify that a statement be run on all partitions,
* but the semantics of the statement may indicate that the same result could be produced more optimally
* by running it on a single partition selected based on the hash of some partition key value,
* whether a statement parameter or a constant in the text of the statement.
* These cases arise both in queries and in (partitioned table) DML.
* As a multi-partition statement is analyzed in the planner, this object is filled in with details
* regarding its suitability for running correctly on a single partition.
*
* For a multi-fragment plan that contains a join,
* is it better to send partitioned tuples and join them on the coordinator
* or is it better to join them before sending?
* If bandwidth (or capacity of the receiving temp table) were the primary concern,
* a decision could be based on
* A) how much wider the joined rows are than the pre-joined rows.
* B) the expected yield of the join filtering -- does each pre-joined row typically
* match and get joined with multiple partner rows or does it typically fail to match
* any row.
* The statistics required to determine "B" are not generally available.
* In any case, there are two over-arching concerns.
* One is the correct handling of a special case
* -- a join of partitioned tables on their partition keys.
* In this case, the join MUST happen on each partition prior to sending any tuples.
* This restriction stems directly from the limitation that there can only be two fragments in a plan,
* and that a fragment produces a single (intermediate or final) result table.
* The "coordinator" receives the (one) intermediate result table and produces
* the final result table. It can not receive tuples from two different partitioned tables.
* The second over-arching consideration is that there is an optimization available to the
* transaction processor for the special case in which a coordinator fragment does not need to
* access any persistent local data (I learned this second hand from Izzy. --paul).
* This provides further motivation to do all scanning and joining in the collector fragment
* prior to sending tuples.
*
* These two considerations normally override all others,
* so that all multi-partition plans only "send after all joins", regardless of bandwidth/capacity
* considerations, but there remain some edge cases in which the decision MUST go the other way,
* that is, sending tuples prior to joining on the coordinator.
* This occurs for some OUTER JOINS between a replicated OUTER table and a partitioned INNER table as in:
*
* SELECT * FROM replicated R LEFT JOIN partitioned P ON ...;
*
* See the comment in SelectSubPlanAssembler.getSelectSubPlanForJoin
*/
public class StatementPartitioning implements Cloneable {
    /**
     * This value is only meaningful if m_inferPartitioning is false.
     * It can be set true to force single-partition statement planning and
     * to forbid single-partition planning/execution of replicated table DML.
     * Since that would corrupt the replication, it is flagged as an error.
     * Otherwise, no attempt is made to validate that a single partition statement would
     * have the same result as the same query run on all partitions.
     * It is up to the user to decide whether that is an issue.
     * It can be set to false to force multi-partition statement planning.
     * This MAY involve sub-optimal dispatch of fragments to partitions with no matching data.
     * Currently, even inserts into partitioned tables are allowed to successfully execute
     * on "wrong" partitions, but they are prevented at the lowest level from taking effect there.
     */
    private final boolean m_forceSP;

    /**
     * Enables inference of single partitioning from the statement.
     */
    private final boolean m_inferPartitioning;

    /*
     * For partitioned table DML, caches the partitioning column for later matching with its prospective value.
     * If that value is constant or a parameter, SP is an option.
     */
    private Column m_partitionColForDML; // Not used in SELECT plans.

    /*
     * For a multi-partition statement that can definitely be run SP, this is a constant partitioning key value
     * inferred from the analysis (suitable for hashinating).
     * If null, SP may not be safe, or the partitioning may be based on something less obvious
     * like a parameter or constant expression.
     */
    private Object m_inferredValue = null;

    /*
     * Index of the statement parameter that equality-filters a partitioning column,
     * or -1 if no such parameter has been recorded.
     */
    private int m_inferredParameterIndex = -1;

    /*
     * Any constant/parameter-based expressions found to be equality-filtering partitioning columns.
     */
    private final Set<AbstractExpression> m_inferredExpression = new HashSet<AbstractExpression>();

    /*
     * The actual number of partitioned table scans in the query (when supported, self-joins should count as multiple).
     * -1 until analyzeTablePartitioning has run.
     */
    private int m_countOfPartitionedTables = -1;

    /*
     * The number of independently partitioned table scans in the query. This is initially the same as
     * m_countOfPartitionedTables, but gets reduced by 1 each time a partitioned table (scan)'s partitioning column
     * is seen to be filtered by equality to a constant value or to a previously scanned partition column.
     * When the count is 0, the statement can be executed single-partition.
     * When the count is 1, multi-partition execution can join any number of tables in the collector plan fragment.
     * When the count is 2 or greater, the statement would require three or more fragments to execute, so is disallowed.
     */
    private int m_countOfIndependentlyPartitionedTables = -1;

    /*
     * If true, and the target table is replicated,
     * SP execution is strictly forbidden, even if requested.
     */
    private boolean m_isDML = false;

    /*
     * The table and column name of a partitioning column, typically the first scanned, if there are more than one,
     * proposed in feedback messages for possible use in single-partitioning annotations and attributes.
     */
    private String m_fullColumnName;

    /*
     * Result of the most recent analyzeForMultiPartitionAccess call: false when the joins found
     * cannot be supported. Starts out true.
     */
    private boolean m_joinValid = true;

    /**
     * Instances are obtained through {@link #forceSP()}, {@link #forceMP()} or {@link #inferPartitioning()}.
     *
     * @param inferPartitioning true if MP plans should be automatically optimized for SP where possible
     * @param forceSP true if only SP plans are to be assumed; only meaningful when inferPartitioning is false
     */
    private StatementPartitioning(boolean inferPartitioning, boolean forceSP) {
        m_inferPartitioning = inferPartitioning;
        m_forceSP = forceSP;
    }

    /**
     * @return a partitioning that forces single-partition planning, with no inference
     */
    public static StatementPartitioning forceSP() {
        return new StatementPartitioning(false, true);
    }

    /**
     * @return a partitioning that forces multi-partition planning, with no inference
     */
    public static StatementPartitioning forceMP() {
        return new StatementPartitioning(false, false);
    }

    /**
     * @return a partitioning that defaults to multi-partition but enables inference of single partitioning
     */
    public static StatementPartitioning inferPartitioning() {
        return new StatementPartitioning(true, /* default to MP */ false);
    }

    /**
     * @return true if inference of single partitioning is enabled
     */
    public boolean isInferred() {
        return m_inferPartitioning;
    }

    /**
     * Creates a fresh instance carrying over only the two configuration flags
     * (inference enabled, SP forced). None of the analysis state (counts, inferred
     * expressions/values, DML flag, join validity) is copied.
     *
     * @return a new StatementPartitioning with the same configuration
     */
    @Override
    public Object clone() {
        return new StatementPartitioning(m_inferPartitioning, m_forceSP);
    }

    /**
     * @return true if single-partition planning was explicitly forced (and not merely open to inference)
     */
    public boolean wasSpecifiedAsSingle() {
        return m_forceSP && ! m_inferPartitioning;
    }

    /**
     * Returns true if the expression can be used to restrict plan execution to a single partition.
     * For now this is only a constant or a parameter. (In the future, one could
     * imagine evaluating expressions like sqrt(8 * 8) and the like during planning)
     *
     * @param expr The expression to consider
     * @return true if expr is a parameter or a constant value expression
     */
    private static boolean isUsefulPartitioningExpression(AbstractExpression expr) {
        return (expr instanceof ParameterValueExpression) ||
               (expr instanceof ConstantValueExpression);
    }

    /**
     * Records an expression that equality-filters a partitioning column.
     * The first column name reported is remembered for feedback messages; later names are ignored.
     * A parameter expression updates the inferred parameter index; any other expression
     * updates the inferred partitioning value.
     *
     * @param fullColumnName table.column name of a(nother) equality-filtered partitioning column
     * @param constExpr a constant/parameter-based expression that equality-filters the partitioning column
     * @param valueType the type of the partitioning column, used to extract the partitioning value
     */
    public void addPartitioningExpression(String fullColumnName, AbstractExpression constExpr,
                                          VoltType valueType) {
        if (m_fullColumnName == null) {
            m_fullColumnName = fullColumnName;
        }
        m_inferredExpression.add(constExpr);
        if (constExpr instanceof ParameterValueExpression) {
            ParameterValueExpression pve = (ParameterValueExpression) constExpr;
            m_inferredParameterIndex = pve.getParameterIndex();
        } else {
            m_inferredValue = ConstantValueExpression.extractPartitioningValue(valueType, constExpr);
        }
    }

    /**
     * For a multi-partition statement that can definitely be run SP, this is a constant partitioning key value
     * inferred from the analysis (suitable for hashinating).
     * If null, SP may not be safe, or the partitioning may be based on something less obvious
     * like a parameter or constant expression.
     *
     * @return an instance of String or an instance of container class Long
     */
    public Object getInferredPartitioningValue() {
        return m_inferredValue;
    }

    /**
     * @return the index of the parameter last recorded as a partitioning expression, or -1 if none
     */
    public int getInferredParameterIndex() {
        return m_inferredParameterIndex;
    }

    /**
     * @return the number of partitioned table scans counted by analyzeTablePartitioning
     */
    public int getCountOfPartitionedTables() {
        // Should always have been set, early on.
        assert(m_countOfPartitionedTables != -1);
        return m_countOfPartitionedTables;
    }

    /**
     * @return the number of independently partitioned table scans, or -1 if no analysis has run yet
     */
    public int getCountOfIndependentlyPartitionedTables() {
        return m_countOfIndependentlyPartitionedTables;
    }

    /**
     * Returns true if partitioning inference has been requested, and
     * at least one of the following is true:
     * - There are no independently partitioned tables and the statement is not DML, OR
     * - There is a single useful partitioning expression
     */
    public boolean isInferredSingle() {
        return m_inferPartitioning &&
                (((m_countOfIndependentlyPartitionedTables == 0) && ! m_isDML) ||
                 (singlePartitioningExpression() != null));
    }

    /**
     * Returns true if the statement will require two fragments.
     * With inference enabled, that is whenever single partitioning could not be inferred.
     * Without inference, that is whenever SP was not forced and at least one partitioned table is scanned.
     */
    public boolean requiresTwoFragments() {
        if (m_inferPartitioning) {
            return ! isInferredSingle();
        }
        return ! (m_forceSP || (m_countOfPartitionedTables == 0));
    }

    /**
     * smart accessor - only returns a value if it was unique and is useful
     * @return the one recorded partitioning expression if it is a constant or parameter, else null
     */
    public AbstractExpression singlePartitioningExpression() {
        AbstractExpression e = singlePartitioningExpressionForReport();
        if (e != null && isUsefulPartitioningExpression(e)) {
            return e;
        }
        return null;
    }

    /**
     * smart accessor - only returns a value if it was unique.
     * @return the one recorded partitioning expression, or null if there are none or several
     */
    public AbstractExpression singlePartitioningExpressionForReport() {
        if (m_inferredExpression.size() == 1) {
            return m_inferredExpression.iterator().next();
        }
        return null;
    }

    /**
     * @return true if the statement was flagged as DML and no independently partitioned table was found
     */
    public boolean getIsReplicatedTableDML() {
        return m_isDML && (m_countOfIndependentlyPartitionedTables == 0);
    }

    /**
     * Flags the statement as DML, potentially enabling the replicated table DML check.
     */
    public void setIsDML() { m_isDML = true; }

    /**
     * @return the table.column name first recorded by addPartitioningExpression, or null if none
     */
    public String getFullColumnName() {
        return m_fullColumnName;
    }

    /**
     * Caches the partitioning column of a DML target. Ignored unless inference is enabled.
     * @param partitioncolumn the partitioning column of the DML target table
     */
    public void setPartitioningColumnForDML(Column partitioncolumn) {
        if (m_inferPartitioning) {
            m_partitionColForDML = partitioncolumn; // Not used in SELECT plans.
        }
    }

    /**
     * @return the partitioning column cached for DML, or null if none was set
     */
    public Column getPartitionColForDML() {
        return m_partitionColForDML;
    }

    /**
     * Given the query's list of tables and its collection(s) of equality-filtered columns and their equivalents,
     * determine whether all joins involving partitioned tables can be executed locally on a single partition.
     * This is only the case when they include equality comparisons between partition key columns.
     * VoltDB will reject joins of multiple partitioned tables unless all their partition keys are
     * constrained to be equal to each other.
     * Example: select * from T1, T2 where T1.ID = T2.ID
     * Additionally, in this case, there may be a constant equality filter on any of the columns,
     * which we want to extract as our SP partitioning parameter.
     *
     * Nothing is returned; the results are left in this object:
     * the count of independently partitioned tables
     * -- partitioned tables that aren't joined or filtered by the same value --
     * (see {@link #getCountOfIndependentlyPartitionedTables()}), the join validity
     * (see {@link #isJoinValid()}), and possibly a recorded partitioning expression.
     * The caller can raise an alarm if there is more than one independently partitioned table.
     *
     * @param collection The table scans.
     * @param valueEquivalence Their column equality filters
     */
    public void analyzeForMultiPartitionAccess(Collection<StmtTableScan> collection,
            HashMap<AbstractExpression, Set<AbstractExpression>> valueEquivalence)
    {
        TupleValueExpression tokenPartitionKey = null;
        Set< Set<AbstractExpression> > eqSets = new HashSet< Set<AbstractExpression> >();
        int unfilteredPartitionKeyCount = 0;

        // reset this flag to forget the last result of the multiple partition access path.
        // AdHoc with parameters will call this function at least two times
        // By default this flag should be true.
        m_joinValid = true;
        boolean subqueryHasReceiveNode = false;
        boolean hasPartitionedTableJoin = false;

        // Iterate over the tables to collect partition columns.
        for (StmtTableScan tableScan : collection) {
            // Replicated tables don't need filter coverage.
            if (tableScan.getIsReplicated()) {
                continue;
            }

            // The partition column can be null in an obscure edge case.
            // The table is declared non-replicated yet specifies no partitioning column.
            // This can occur legitimately when views based on partitioned tables neglect to group by the partition column.
            // The interpretation of this edge case is that the table has "randomly distributed data".
            // In such a case, the table is valid for use by MP queries only and can only be joined with replicated tables
            // because it has no recognized partitioning join key.
            List<SchemaColumn> columnsNeedingCoverage = tableScan.getPartitioningColumns();

            if (tableScan instanceof StmtSubqueryScan) {
                StmtSubqueryScan subScan = (StmtSubqueryScan) tableScan;
                subScan.promoteSinglePartitionInfo(valueEquivalence, eqSets);

                if (subScan.hasReceiveNode()) {
                    if (subqueryHasReceiveNode) {
                        // Has found another subquery with receive node on the same level
                        // Not going to support this kind of subquery join with 2 fragment plan.
                        m_joinValid = false;

                        // Stop scanning; the independent partition table count below
                        // is computed from whatever has been collected so far.
                        break;
                    }
                    subqueryHasReceiveNode = true;

                    if (subScan.isTableAggregate()) {
                        // Partition Table Aggregate only return one aggregate row.
                        // It has been marked with receive node, any join or processing based on
                        // this table aggregate subquery should be done on coordinator.
                        // Joins: has to be replicated table
                        // Any process based on this subquery should require 1 fragment only.
                        continue;
                    }
                } else {
                    // this subquery partition table without receive node
                    hasPartitionedTableJoin = true;
                }
            } else {
                // This table is a partition table
                hasPartitionedTableJoin = true;
            }

            // Look for equality-filtered columns that cover this scan's partitioning column(s).
            boolean unfiltered = true;
            for (AbstractExpression candidateColumn : valueEquivalence.keySet()) {
                if ( ! (candidateColumn instanceof TupleValueExpression)) {
                    continue;
                }
                TupleValueExpression candidatePartitionKey = (TupleValueExpression) candidateColumn;
                if ( ! canCoverPartitioningColumn(candidatePartitionKey, columnsNeedingCoverage)) {
                    continue;
                }
                unfiltered = false;
                // Remember the first covering column; it supplies the name and type reported below.
                if (tokenPartitionKey == null) {
                    tokenPartitionKey = candidatePartitionKey;
                }
                eqSets.add(valueEquivalence.get(candidatePartitionKey));
            }

            if (unfiltered) {
                ++unfilteredPartitionKeyCount;
            }
        } // end for each table StmtTableScan in the collection

        m_countOfIndependentlyPartitionedTables = eqSets.size() + unfilteredPartitionKeyCount;
        if (m_countOfIndependentlyPartitionedTables > 1) {
            m_joinValid = false;
        }

        // This is the case that subquery with receive node join with another partition table
        // on outer level. Not going to support this kind of join.
        if (subqueryHasReceiveNode && hasPartitionedTableJoin) {
            m_joinValid = false;
        }

        // Defensive null check: eqSets is also handed to promoteSinglePartitionInfo, and the
        // table-aggregate "continue" and the "break" above skip the loop that assigns
        // tokenPartitionKey. If that helper adds to eqSets (presumably it can -- TODO confirm),
        // tokenPartitionKey may still be null here; skip the inference instead of dereferencing it.
        if ((unfilteredPartitionKeyCount == 0) && (eqSets.size() == 1) && (tokenPartitionKey != null)) {
            Set<AbstractExpression> partitioningValues = eqSets.iterator().next();
            for (AbstractExpression constExpr : partitioningValues) {
                if (constExpr instanceof TupleValueExpression) {
                    continue;
                }
                VoltType valueType = tokenPartitionKey.getValueType();
                addPartitioningExpression(tokenPartitionKey.getTableName() +
                        '.' + tokenPartitionKey.getColumnName(), constExpr, valueType);
                // Only need one constant value.
                break;
            }
        }
    }

    /**
     * @return false if the most recent analyzeForMultiPartitionAccess call found an unsupported join,
     * true otherwise (including before any analysis)
     */
    public boolean isJoinValid() {
        return m_joinValid;
    }

    /**
     * Checks whether a candidate column matches one of a scan's partitioning columns,
     * comparing the candidate's table alias and column name against each
     * partitioning column's table alias and column alias.
     *
     * @param candidatePartitionKey the equality-filtered column being considered
     * @param columnsNeedingCoverage the scan's partitioning columns, possibly null
     * @return true if a match is found; false if there is none or the list is null
     */
    private static boolean canCoverPartitioningColumn(TupleValueExpression candidatePartitionKey,
            List<SchemaColumn> columnsNeedingCoverage) {
        if (columnsNeedingCoverage == null) {
            return false;
        }

        for (SchemaColumn col : columnsNeedingCoverage) {
            String partitionedTableAlias = col.getTableAlias();
            String columnNeedingCoverage = col.getColumnAlias();
            assert(candidatePartitionKey.getTableAlias() != null);
            if ( ! candidatePartitionKey.getTableAlias().equals(partitionedTableAlias)) {
                continue;
            }
            String candidateColumnName = candidatePartitionKey.getColumnName();
            if ( ! candidateColumnName.equals(columnNeedingCoverage)) {
                continue;
            }
            // Maybe need more checkings
            return true;
        }
        return false;
    }

    /**
     * Counts the non-replicated table scans and uses that count as the initial value
     * for the number of independently partitioned tables.
     *
     * @param collection the statement's table scans
     * @throws PlanningErrorException declared in the signature; nothing in this method body throws it
     */
    void analyzeTablePartitioning(Collection<StmtTableScan> collection)
            throws PlanningErrorException
    {
        m_countOfPartitionedTables = 0;
        // Do we have a need for a distributed scan at all?
        // Iterate over the tables to collect partition columns.
        for (StmtTableScan tableScan : collection) {
            if ( ! tableScan.getIsReplicated()) {
                ++m_countOfPartitionedTables;
            }
        }
        // Initial guess -- as if no equality filters.
        m_countOfIndependentlyPartitionedTables = m_countOfPartitionedTables;
    }
}