/* This file is part of VoltDB.
* Copyright (C) 2008-2014 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb.planner;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.json_voltpatches.JSONException;
import org.voltdb.VoltType;
import org.voltdb.catalog.CatalogMap;
import org.voltdb.catalog.Cluster;
import org.voltdb.catalog.Column;
import org.voltdb.catalog.ColumnRef;
import org.voltdb.catalog.Connector;
import org.voltdb.catalog.ConnectorTableInfo;
import org.voltdb.catalog.Database;
import org.voltdb.catalog.Index;
import org.voltdb.catalog.Table;
import org.voltdb.expressions.AbstractExpression;
import org.voltdb.expressions.AggregateExpression;
import org.voltdb.expressions.ExpressionUtil;
import org.voltdb.expressions.OperatorExpression;
import org.voltdb.expressions.TupleAddressExpression;
import org.voltdb.expressions.TupleValueExpression;
import org.voltdb.planner.ParsedSelectStmt.ParsedColInfo;
import org.voltdb.planner.microoptimizations.MicroOptimizationRunner;
import org.voltdb.planner.parseinfo.BranchNode;
import org.voltdb.planner.parseinfo.JoinNode;
import org.voltdb.planner.parseinfo.StmtSubqueryScan;
import org.voltdb.planner.parseinfo.StmtTableScan;
import org.voltdb.plannodes.AbstractJoinPlanNode;
import org.voltdb.plannodes.AbstractPlanNode;
import org.voltdb.plannodes.AbstractScanPlanNode;
import org.voltdb.plannodes.AggregatePlanNode;
import org.voltdb.plannodes.DeletePlanNode;
import org.voltdb.plannodes.DistinctPlanNode;
import org.voltdb.plannodes.HashAggregatePlanNode;
import org.voltdb.plannodes.IndexScanPlanNode;
import org.voltdb.plannodes.InsertPlanNode;
import org.voltdb.plannodes.LimitPlanNode;
import org.voltdb.plannodes.MaterializePlanNode;
import org.voltdb.plannodes.NestLoopPlanNode;
import org.voltdb.plannodes.NodeSchema;
import org.voltdb.plannodes.OrderByPlanNode;
import org.voltdb.plannodes.PartialAggregatePlanNode;
import org.voltdb.plannodes.ProjectionPlanNode;
import org.voltdb.plannodes.ReceivePlanNode;
import org.voltdb.plannodes.SchemaColumn;
import org.voltdb.plannodes.SendPlanNode;
import org.voltdb.plannodes.SeqScanPlanNode;
import org.voltdb.plannodes.UnionPlanNode;
import org.voltdb.plannodes.UpdatePlanNode;
import org.voltdb.types.ExpressionType;
import org.voltdb.types.IndexType;
import org.voltdb.types.JoinType;
import org.voltdb.types.PlanNodeType;
import org.voltdb.types.SortDirectionType;
import org.voltdb.utils.CatalogUtil;
/**
* The query planner accepts catalog data and SQL statements from the catalog, then
* outputs a set of complete and correct query plans. It will output MANY plans
* and some of them will be stupid. The best plan will be selected by computing
* resource usage statistics for the plans, then using those statistics to
* compute the cost of a specific plan. The plan with the lowest cost wins.
*
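* A rough usage sketch (variable names here are illustrative, following the
* pattern the UNION and subquery planning code below uses):
* <pre>
*   PlanAssembler assembler =
*       new PlanAssembler(catalogCluster, catalogDb, partitioning, planSelector);
*   CompiledPlan best = assembler.getBestCostPlan(parsedStmt);
*   if (best != null) {
*       assembler.finalizeBestCostPlan();
*   }
* </pre>
*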
*/
public class PlanAssembler {
// A convenience struct to accumulate results from planning multiple (sub-)statements
private static class ParsedResultAccumulator {
public final boolean m_orderIsDeterministic;
public final boolean m_hasLimitOrOffset;
public final int m_planId;
public ParsedResultAccumulator(boolean orderIsDeterministic, boolean hasLimitOrOffset,
int planId)
{
m_orderIsDeterministic = orderIsDeterministic;
m_hasLimitOrOffset = hasLimitOrOffset;
m_planId = planId;
}
}
/** convenience pointer to the cluster object in the catalog */
final Cluster m_catalogCluster;
/** convenience pointer to the database object in the catalog */
final Database m_catalogDb;
/** parsed statement for an insert */
ParsedInsertStmt m_parsedInsert = null;
/** parsed statement for an update */
ParsedUpdateStmt m_parsedUpdate = null;
/** parsed statement for a delete */
ParsedDeleteStmt m_parsedDelete = null;
/** parsed statement for a select */
ParsedSelectStmt m_parsedSelect = null;
/** parsed statement for a union */
ParsedUnionStmt m_parsedUnion = null;
/** plan selector */
PlanSelector m_planSelector;
/** Describes the specified and inferred partition context. */
private StatementPartitioning m_partitioning;
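/** @return the statement's partitioning context, as specified or as inferred during planning */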
public StatementPartitioning getPartition() {
return m_partitioning;
}
/** Error message */
String m_recentErrorMsg;
/**
* Used to generate the table-touching parts of a plan. All join-order and
* access-path selection is done by the SelectSubPlanAssembler.
*/
SubPlanAssembler subAssembler = null;
/**
* Flag set once the only expected plan for a statement has been generated.
*/
boolean m_bestAndOnlyPlanWasGenerated = false;
/**
*
* @param catalogCluster
* Catalog info about the physical layout of the cluster.
* @param catalogDb
* Catalog info about schema, metadata and procedures.
* @param partitioning
* Describes the specified and inferred partition context.
* @param planSelector
* Tracks candidate plans and selects the best one.
*/
PlanAssembler(Cluster catalogCluster, Database catalogDb, StatementPartitioning partitioning, PlanSelector planSelector) {
m_catalogCluster = catalogCluster;
m_catalogDb = catalogDb;
m_partitioning = partitioning;
m_planSelector = planSelector;
}
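/** @return the SQL text of whichever statement this assembler was set up to plan */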
String getSQLText() {
if (m_parsedDelete != null) {
return m_parsedDelete.m_sql;
}
else if (m_parsedInsert != null) {
return m_parsedInsert.m_sql;
}
else if (m_parsedUpdate != null) {
return m_parsedUpdate.m_sql;
}
else if (m_parsedSelect != null) {
return m_parsedSelect.m_sql;
}
assert(false);
return null;
}
/**
* Return true if tableList includes at least one materialized view.
*/
private static boolean tableListIncludesView(List<Table> tableList) {
for (Table table : tableList) {
if (table.getMaterializer() != null) {
return true;
}
}
return false;
}
/**
* Return true if tableList includes at least one export table.
*/
private boolean tableListIncludesExportOnly(List<Table> tableList) {
// the single well-known connector
Connector connector = m_catalogDb.getConnectors().get("0");
// no export tables without a connector
if (connector == null) {
return false;
}
CatalogMap<ConnectorTableInfo> tableinfo = connector.getTableinfo();
// this loop is O(number-of-joins * number-of-export-tables)
// which seems acceptable if not great. Probably faster than
// re-hashing the export-only tables for faster lookup.
for (Table table : tableList) {
for (ConnectorTableInfo ti : tableinfo) {
if (ti.getAppendonly() &&
ti.getTable().getTypeName().equalsIgnoreCase(table.getTypeName()))
{
return true;
}
}
}
return false;
}
/**
* Clear any old state and get ready to plan a new statement. The next call to
* getNextPlan() will return the first candidate plan for these parameters.
*
*/
void setupForNewPlans(AbstractParsedStmt parsedStmt) {
m_bestAndOnlyPlanWasGenerated = false;
m_partitioning.analyzeTablePartitioning(parsedStmt.m_tableAliasMap.values());
if (parsedStmt instanceof ParsedUnionStmt) {
m_parsedUnion = (ParsedUnionStmt) parsedStmt;
return;
}
if (parsedStmt instanceof ParsedSelectStmt) {
if (tableListIncludesExportOnly(parsedStmt.m_tableList)) {
throw new PlanningErrorException(
"Illegal to read an export table.");
}
m_parsedSelect = (ParsedSelectStmt) parsedStmt;
// Simplify the outer join if possible
if (m_parsedSelect.m_joinTree instanceof BranchNode) {
// The execution engine expects to see the outer table on the left side only,
// which means that RIGHT joins need to be converted to LEFT ones
((BranchNode)m_parsedSelect.m_joinTree).toLeftJoin();
simplifyOuterJoin((BranchNode)m_parsedSelect.m_joinTree);
}
subAssembler = new SelectSubPlanAssembler(m_catalogDb, m_parsedSelect, m_partitioning);
// Process the GROUP BY information and decide whether the statement groups by the partition column
for (ParsedColInfo groupbyCol: m_parsedSelect.m_groupByColumns) {
StmtTableScan scanTable = m_parsedSelect.m_tableAliasMap.get(groupbyCol.tableAlias);
// table alias may be from "VOLT_TEMP_TABLE".
if (scanTable != null && scanTable.getPartitioningColumns() != null) {
for (SchemaColumn pcol : scanTable.getPartitioningColumns()) {
if (pcol != null && pcol.getColumnName().equals(groupbyCol.columnName) ) {
m_parsedSelect.setHasPartitionColumnInGroupby();
break;
}
}
}
if (m_parsedSelect.hasPartitionColumnInGroupby()) {
break;
}
}
return;
}
// @TODO
// Need to use StmtTableScan instead
// check that no modification happens to views
if (tableListIncludesView(parsedStmt.m_tableList)) {
throw new PlanningErrorException("Illegal to modify a materialized view.");
}
m_partitioning.setIsDML();
// Check that only multi-partition writes are made to replicated tables.
// figure out which table we're updating/deleting
assert (parsedStmt.m_tableList.size() == 1);
Table targetTable = parsedStmt.m_tableList.get(0);
if (targetTable.getIsreplicated()) {
if (m_partitioning.wasSpecifiedAsSingle()) {
String msg = "Trying to write to replicated table '" + targetTable.getTypeName()
+ "' in a single-partition procedure.";
throw new PlanningErrorException(msg);
}
} else if (m_partitioning.wasSpecifiedAsSingle() == false) {
m_partitioning.setPartitioningColumnForDML(targetTable.getPartitioncolumn());
}
if (parsedStmt instanceof ParsedInsertStmt) {
m_parsedInsert = (ParsedInsertStmt) parsedStmt;
// The currently handled inserts are too simple to even require a subplan assembler. So, done.
return;
}
if (parsedStmt instanceof ParsedUpdateStmt) {
if (tableListIncludesExportOnly(parsedStmt.m_tableList)) {
throw new PlanningErrorException("Illegal to update an export table.");
}
m_parsedUpdate = (ParsedUpdateStmt) parsedStmt;
} else if (parsedStmt instanceof ParsedDeleteStmt) {
if (tableListIncludesExportOnly(parsedStmt.m_tableList)) {
throw new PlanningErrorException("Illegal to delete from an export table.");
}
m_parsedDelete = (ParsedDeleteStmt) parsedStmt;
} else {
throw new RuntimeException("Unknown subclass of AbstractParsedStmt.");
}
if ( ! m_partitioning.wasSpecifiedAsSingle()) {
//TODO: When updates and deletes can contain joins, this step may have to be
// deferred so that the valueEquivalence set can be analyzed per join order.
// This appears to be an unfortunate side effect of how the HSQL interface
// misleadingly organizes the placement of join/where filters on the statement tree.
// This throws off the accounting of equivalence join filters until they can be
// normalized in analyzeJoinFilters, but that normalization process happens on a
// per-join-order basis, and so must this analysis.
HashMap<AbstractExpression, Set<AbstractExpression>>
valueEquivalence = parsedStmt.analyzeValueEquivalence();
m_partitioning.analyzeForMultiPartitionAccess(parsedStmt.m_tableAliasMap.values(), valueEquivalence);
}
subAssembler = new WriterSubPlanAssembler(m_catalogDb, parsedStmt, m_partitioning);
}
/**
* Generate the best cost plan for the current SQL statement context.
*
* @param parsedStmt Current SQL statement to generate plan for
* @return The best cost plan or null.
*/
public CompiledPlan getBestCostPlan(AbstractParsedStmt parsedStmt) {
// plan any subqueries that the statement contains
List<StmtSubqueryScan> subqueryNodes = parsedStmt.getSubqueries();
ParsedResultAccumulator subQueryResult = null;
if (! subqueryNodes.isEmpty()) {
subQueryResult = getBestCostPlanForSubQueries(subqueryNodes);
}
// set up the plan assembler for this statement
setupForNewPlans(parsedStmt);
// get ready to find the plan with minimal cost
CompiledPlan rawplan = null;
// loop over all possible plans
while (true) {
rawplan = getNextPlan();
// stop this while loop when no more plans are generated
if (rawplan == null)
break;
// Update the best cost plan so far
m_planSelector.considerCandidatePlan(rawplan, parsedStmt);
}
CompiledPlan retval = m_planSelector.m_bestPlan;
if (subQueryResult != null && retval != null) {
boolean orderIsDeterministic;
if (subQueryResult.m_orderIsDeterministic) {
orderIsDeterministic = retval.isOrderDeterministic();
} else {
//TODO: this reliance on the vague isOrderDeterministicInSpiteOfUnorderedSubqueries test
// is subject to false negatives for determinism. It misses the subtlety of parent
// queries that surgically add orderings for specific "key" columns of a subquery result
// or a subquery-based join for an effectively deterministic result.
// The first step towards repairing this would involve detecting deterministic and
// non-deterministic subquery results IN CONTEXT where they are scanned in the parent
// query, so that the parent query can ensure that ALL the columns from a
// non-deterministic subquery are later sorted.
// The next step would be to extend the model for "subquery scans"
// to identify dependencies / uniqueness constraints in subquery results
// that can be exploited to impose determinism with fewer parent order by columns
// -- like just the keys.
orderIsDeterministic = retval.isOrderDeterministic() &&
parsedStmt.isOrderDeterministicInSpiteOfUnorderedSubqueries();
}
boolean hasLimitOrOffset =
subQueryResult.m_hasLimitOrOffset || retval.hasLimitOrOffset();
retval.statementGuaranteesDeterminism(hasLimitOrOffset, orderIsDeterministic);
// Need to re-attach the sub-queries' plans to the best parent plan. The same best plan for each
// sub-query is reused with all parent candidate plans and needs to be reconnected with
// the final best parent plan.
retval.rootPlanGraph = connectChildrenBestPlans(retval.rootPlanGraph);
}
// If we have content non-determinism on DML, then fail planning.
// This can happen in case of an INSERT INTO ... SELECT ... where the select statement has a limit on unordered data.
// This may also be a concern in the future if we allow subqueries in UPDATE and DELETE statements
// (e.g., WHERE c IN (SELECT ...))
if (retval != null && !retval.getReadOnly() && !retval.isOrderDeterministic()) {
String errorMsg = "DML statement manipulates data in content non-deterministic way ";
if (parsedStmt.m_isUpsert) {
throw new PlanningErrorException(errorMsg +
"(this may happen on UPSERT INTO ... SELECT, for example).");
}
if (retval.hasLimitOrOffset()) {
throw new PlanningErrorException(errorMsg +
"(this may happen on INSERT INTO ... SELECT, for example).");
}
}
return retval;
}
/**
* Output the best cost plan.
*
*/
public void finalizeBestCostPlan() {
m_planSelector.finalizeOutput();
}
/**
* Generate the best cost plans for the immediate sub-queries of the
* current SQL statement context.
* @param subqueryNodes - the statement's immediate subquery scans
* @return a ParsedResultAccumulator summarizing the subquery plans
*/
private ParsedResultAccumulator getBestCostPlanForSubQueries(List<StmtSubqueryScan> subqueryNodes) {
int nextPlanId = 0;
boolean orderIsDeterministic = true;
boolean hasSignificantOffsetOrLimit = false;
for (StmtSubqueryScan subqueryScan : subqueryNodes) {
ParsedResultAccumulator parsedResult = planForParsedSubquery(subqueryScan, nextPlanId);
if (parsedResult == null) {
throw new PlanningErrorException(m_recentErrorMsg);
}
nextPlanId = parsedResult.m_planId;
orderIsDeterministic &= parsedResult.m_orderIsDeterministic;
// Offsets or limits in subqueries are only significant (only affect content determinism)
// when they apply to un-ordered subquery contents.
hasSignificantOffsetOrLimit |=
(( ! parsedResult.m_orderIsDeterministic) && parsedResult.m_hasLimitOrOffset);
}
// need to reset plan id for the entire SQL
m_planSelector.m_planId = nextPlanId;
return new ParsedResultAccumulator(orderIsDeterministic, hasSignificantOffsetOrLimit, nextPlanId);
}
/**
* Generate a unique and correct plan for the current SQL statement context.
* This method gets called repeatedly until it returns null, meaning there
* are no more plans.
*
* @return A not-previously returned query plan or null if no more
* computable plans.
*/
private CompiledPlan getNextPlan() {
CompiledPlan retval;
AbstractParsedStmt nextStmt = null;
if (m_parsedUnion != null) {
nextStmt = m_parsedUnion;
retval = getNextUnionPlan();
} else if (m_parsedSelect != null) {
nextStmt = m_parsedSelect;
retval = getNextSelectPlan();
} else if (m_parsedInsert != null) {
nextStmt = m_parsedInsert;
retval = getNextInsertPlan();
} else {
//TODO: push CompiledPlan construction into getNextUpdatePlan/getNextDeletePlan
//
retval = new CompiledPlan();
if (m_parsedUpdate != null) {
nextStmt = m_parsedUpdate;
retval.rootPlanGraph = getNextUpdatePlan();
// note that for replicated tables, multi-fragment plans
// need to divide the result by the number of partitions
} else if (m_parsedDelete != null) {
nextStmt = m_parsedDelete;
retval.rootPlanGraph = getNextDeletePlan();
// note that for replicated tables, multi-fragment plans
// need to divide the result by the number of partitions
} else {
throw new RuntimeException(
"setupForNewPlans not called or not successfull.");
}
assert (nextStmt.m_tableList.size() == 1);
retval.setReadOnly (false);
if (nextStmt.m_tableList.get(0).getIsreplicated()) {
retval.replicatedTableDML = true;
}
retval.statementGuaranteesDeterminism(false, true); // Until we support DML w/ subqueries/limits
}
if (retval == null || retval.rootPlanGraph == null) {
return null;
}
assert (nextStmt != null);
retval.parameters = nextStmt.getParameters();
return retval;
}
/**
* This is a UNION specific method. Generate a unique and correct plan
* for the current SQL UNION statement by building the best plans for each individual statement
* within the UNION.
*
* @return A union plan or null.
*/
private CompiledPlan getNextUnionPlan() {
// Since only the one "best" plan is considered,
// this method should be called only once.
if (m_bestAndOnlyPlanWasGenerated) {
return null;
}
m_bestAndOnlyPlanWasGenerated = true;
// Simply return a union plan node with the corresponding union type set
AbstractPlanNode subUnionRoot = new UnionPlanNode(m_parsedUnion.m_unionType);
m_recentErrorMsg = null;
ArrayList<CompiledPlan> childrenPlans = new ArrayList<CompiledPlan>();
StatementPartitioning commonPartitioning = null;
// Build best plans for the children first
int planId = 0;
for (AbstractParsedStmt parsedChildStmt : m_parsedUnion.m_children) {
StatementPartitioning partitioning = (StatementPartitioning)m_partitioning.clone();
PlanSelector processor = (PlanSelector) m_planSelector.clone();
processor.m_planId = planId;
PlanAssembler assembler = new PlanAssembler(
m_catalogCluster, m_catalogDb, partitioning, processor);
CompiledPlan bestChildPlan = assembler.getBestCostPlan(parsedChildStmt);
partitioning = assembler.getPartition();
// make sure we got a winner
if (bestChildPlan == null) {
m_recentErrorMsg = assembler.getErrorMessage();
if (m_recentErrorMsg == null) {
m_recentErrorMsg = "Unable to plan for statement. Error unknown.";
}
return null;
}
childrenPlans.add(bestChildPlan);
// Make sure that the next child's plans won't overwrite the current ones.
planId = processor.m_planId;
// Decide whether child statements' partitioning is compatible.
if (commonPartitioning == null) {
commonPartitioning = partitioning;
continue;
}
AbstractExpression statementPartitionExpression = partitioning.singlePartitioningExpression();
if (commonPartitioning.requiresTwoFragments()) {
if (partitioning.requiresTwoFragments() || statementPartitionExpression != null) {
// If two child statements both need to use a second fragment,
// the union can't currently be planned as a two-fragment plan.
// The coordinator expects a single-table result from each partition.
// Also, currently the coordinator of a two-fragment plan is not allowed to
// target a particular partition, so neither can the union of the coordinator
// and a statement that wants to run single-partition.
throw new PlanningErrorException(
"Statements are too complex in set operation using multiple partitioned tables.");
}
// the new statement is apparently a replicated read and has no effect on partitioning
continue;
}
AbstractExpression commonPartitionExpression = commonPartitioning.singlePartitioningExpression();
if (commonPartitionExpression == null) {
// the prior statement(s) were apparently replicated reads
// and have no effect on partitioning
commonPartitioning = partitioning;
continue;
}
if (partitioning.requiresTwoFragments()) {
// Again, currently the coordinator of a two-fragment plan is not allowed to
// target a particular partition, so neither can the union of the coordinator
// and a statement that wants to run single-partition.
throw new PlanningErrorException(
"Statements are too complex in set operation using multiple partitioned tables.");
}
if (statementPartitionExpression == null) {
// the new statement is apparently a replicated read and has no effect on partitioning
continue;
}
if ( ! commonPartitionExpression.equals(statementPartitionExpression)) {
throw new PlanningErrorException(
"Statements use conflicting partitioned table filters in set operation or sub-query.");
}
}
if (commonPartitioning != null) {
m_partitioning = commonPartitioning;
}
// need to reset plan id for the entire UNION
m_planSelector.m_planId = planId;
// Add and link children plans
for (CompiledPlan selectPlan : childrenPlans) {
subUnionRoot.addAndLinkChild(selectPlan.rootPlanGraph);
}
CompiledPlan retval = new CompiledPlan();
retval.rootPlanGraph = subUnionRoot;
retval.setReadOnly(true);
retval.sql = m_planSelector.m_sql;
boolean orderIsDeterministic = m_parsedUnion.isOrderDeterministic();
boolean hasLimitOrOffset = m_parsedUnion.hasLimitOrOffset();
retval.statementGuaranteesDeterminism(hasLimitOrOffset, orderIsDeterministic);
// compute the cost - total of all children
retval.cost = 0.0;
for (CompiledPlan bestChildPlan : childrenPlans) {
retval.cost += bestChildPlan.cost;
}
return retval;
}
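/**
 * Plan one subquery scan with a cloned plan selector and partitioning context,
 * and cache the resulting best plan on the scan for later re-attachment.
 * @param subqueryScan the subquery scan to plan
 * @param planId the next available plan id
 * @return a summary of the subquery plan, or null if planning failed
 */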
private ParsedResultAccumulator planForParsedSubquery(StmtSubqueryScan subqueryScan, int planId) {
AbstractParsedStmt subQuery = subqueryScan.getSubqueryStmt();
assert(subQuery != null);
PlanSelector selector = (PlanSelector) m_planSelector.clone();
selector.m_planId = planId;
StatementPartitioning currentPartitioning = (StatementPartitioning)m_partitioning.clone();
PlanAssembler assembler = new PlanAssembler(
m_catalogCluster, m_catalogDb, currentPartitioning, selector);
CompiledPlan compiledPlan = assembler.getBestCostPlan(subQuery);
// make sure we got a winner
if (compiledPlan == null) {
String tbAlias = subqueryScan.getTableAlias();
m_recentErrorMsg = "Subquery statement for table " + tbAlias
+ " has error: " + assembler.getErrorMessage();
if (m_recentErrorMsg == null) {
m_recentErrorMsg = "Unable to plan for subquery statement for table " + tbAlias;
}
return null;
}
subqueryScan.setSubqueriesPartitioning(currentPartitioning);
// Remove the coordinator send/receive pair.
// It will be added later for the whole plan
compiledPlan.rootPlanGraph = subqueryScan.processReceiveNode(compiledPlan.rootPlanGraph);
subqueryScan.setBestCostPlan(compiledPlan);
ParsedResultAccumulator parsedResult = new ParsedResultAccumulator(
compiledPlan.isOrderDeterministic(), compiledPlan.hasLimitOrOffset(),
selector.m_planId);
return parsedResult;
}
/**
* For each subquery node in the plan tree, attach the subquery's best plan to its parent scan node.
* @param parentPlan the root of the initial plan
* @return A complete plan tree for the entire SQL statement.
*/
private AbstractPlanNode connectChildrenBestPlans(AbstractPlanNode parentPlan) {
if (parentPlan instanceof AbstractScanPlanNode) {
AbstractScanPlanNode scanNode = (AbstractScanPlanNode) parentPlan;
StmtTableScan tableScan = scanNode.getTableScan();
if (tableScan instanceof StmtSubqueryScan) {
CompiledPlan bestCostPlan = ((StmtSubqueryScan)tableScan).getBestCostPlan();
assert (bestCostPlan != null);
AbstractPlanNode subQueryRoot = bestCostPlan.rootPlanGraph;
subQueryRoot.disconnectParents();
scanNode.addAndLinkChild(subQueryRoot);
}
} else {
for (int i = 0; i < parentPlan.getChildCount(); ++i) {
connectChildrenBestPlans(parentPlan.getChild(i));
}
}
return parentPlan;
}
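/**
 * Generate the next candidate plan for the current SELECT statement, layering
 * distribution (send/receive), MV fix-up, aggregation, ordering, projection,
 * and limit handling on top of the table-touching sub-plan.
 * @return the next candidate plan or null if there are no more
 */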
private CompiledPlan getNextSelectPlan() {
assert (subAssembler != null);
AbstractPlanNode subSelectRoot = subAssembler.nextPlan();
if (subSelectRoot == null) {
m_recentErrorMsg = subAssembler.m_recentErrorMsg;
return null;
}
AbstractPlanNode root = subSelectRoot;
boolean mvFixNeedsProjection = false;
/*
* If the access plan for the table in the join order was for a
* distributed table scan there must be a send/receive pair at the top
* EXCEPT for the special outer join case in which a replicated table
* was on the OUTER side of an outer join across from the (joined) scan
* of the partitioned table(s) (all of them) in the query. In that case,
* the one required send/receive pair is already in the plan below the
* inner side of a NestLoop join.
*/
if (m_partitioning.requiresTwoFragments()) {
boolean mvFixInfoCoordinatorNeeded = true;
boolean mvFixInfoEdgeCaseOuterJoin = false;
ArrayList<AbstractPlanNode> receivers = root.findAllNodesOfType(PlanNodeType.RECEIVE);
if (receivers.size() == 1) {
// The subplan SHOULD be good to go, but just make sure that it doesn't
// scan a partitioned table except under the ReceivePlanNode that was just found.
// Edge cases: left outer join with replicated table.
if (m_parsedSelect.m_mvFixInfo.needed()) {
mvFixInfoCoordinatorNeeded = false;
AbstractPlanNode receiveNode = receivers.get(0);
if (receiveNode.getParent(0) instanceof NestLoopPlanNode) {
if (subSelectRoot.hasInlinedIndexScanOfTable(m_parsedSelect.m_mvFixInfo.getMVTableName())) {
return getNextSelectPlan();
}
List<AbstractPlanNode> nljs = receiveNode.findAllNodesOfType(PlanNodeType.NESTLOOP);
List<AbstractPlanNode> nlijs = receiveNode.findAllNodesOfType(PlanNodeType.NESTLOOPINDEX);
// outer join edge case does not have any join plan node under receive node.
// This is like a single table case.
if (nljs.size() + nlijs.size() == 0) {
mvFixInfoEdgeCaseOuterJoin = true;
}
root = handleMVBasedMultiPartQuery(root, mvFixInfoEdgeCaseOuterJoin);
}
}
} else if (receivers.size() > 0) {
throw new PlanningErrorException(
"This special case join between an outer replicated table and " +
"an inner partitioned table is too complex and is not supported.");
} else {
root = SubPlanAssembler.addSendReceivePair(root);
// Root is a receive node here.
assert(root instanceof ReceivePlanNode);
if (m_parsedSelect.mayNeedAvgPushdown()) {
m_parsedSelect.switchOptimalSuiteForAvgPushdown();
}
if (m_parsedSelect.m_tableList.size() > 1 && m_parsedSelect.m_mvFixInfo.needed()
&& subSelectRoot.hasInlinedIndexScanOfTable(m_parsedSelect.m_mvFixInfo.getMVTableName())) {
// An MV-based partitioned join query needs re-aggregation work on the coordinator.
// An index scan on the MV table cannot be supported,
// so an inlined index scan under a nested-loop index join is not possible.
return getNextSelectPlan();
}
}
root = handleAggregationOperators(root);
// Process the re-aggregate plan node and insert it into the plan.
if (m_parsedSelect.m_mvFixInfo.needed() && mvFixInfoCoordinatorNeeded) {
AbstractPlanNode tmpRoot = root;
root = handleMVBasedMultiPartQuery(root, mvFixInfoEdgeCaseOuterJoin);
if (root != tmpRoot) {
mvFixNeedsProjection = true;
}
}
} else {
/*
* There is no receive node, so root is a single-partition plan.
* Since no distributed plan has been generated, the MV fix is not needed.
*/
m_parsedSelect.m_mvFixInfo.setNeeded(false);
root = handleAggregationOperators(root);
}
if (m_parsedSelect.hasComplexAgg()) {
AbstractPlanNode aggNode = root.getChild(0);
root.clearChildren();
aggNode.clearParents();
aggNode = handleOrderBy(aggNode);
root.addAndLinkChild(aggNode);
} else {
root = handleOrderBy(root);
}
if (mvFixNeedsProjection || needProjectionNode(root)) {
root = addProjection(root);
}
if (m_parsedSelect.hasLimitOrOffset())
{
root = handleLimitOperator(root);
}
CompiledPlan plan = new CompiledPlan();
plan.rootPlanGraph = root;
plan.setReadOnly(true);
boolean orderIsDeterministic = m_parsedSelect.isOrderDeterministic();
boolean hasLimitOrOffset = m_parsedSelect.hasLimitOrOffset();
plan.statementGuaranteesDeterminism(hasLimitOrOffset, orderIsDeterministic);
// Apply micro-optimizations: table count, counting index, optimized MIN/MAX
MicroOptimizationRunner.applyAll(plan, m_parsedSelect);
return plan;
}
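/**
 * Decide whether the plan rooted at root still needs a trailing projection
 * node to produce the final display-column schema.
 */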
private boolean needProjectionNode (AbstractPlanNode root) {
if ( root instanceof AggregatePlanNode ||
root.getPlanNodeType() == PlanNodeType.DISTINCT ||
root.getPlanNodeType() == PlanNodeType.PROJECTION) {
return false;
}
// Assume these restrictions: ORDER BY columns are (1) columns from the table,
// (2) aliases of display columns, or (3) actual expressions from display columns.
// Currently, we do not allow ordering by complex expressions that are not in the display columns.
// If there is a complex GROUP BY at this point, the display columns contain all the ORDER BY columns,
// so this plan does not require another projection node on top of the sort node.
if (m_parsedSelect.hasComplexGroupby()) {
return false;
}
if (root.getPlanNodeType() == PlanNodeType.RECEIVE &&
m_parsedSelect.hasPartitionColumnInGroupby()) {
// The top aggregate has been removed; its schema is exactly the same as
// its local aggregate node's schema.
return false;
}
// TODO: Maybe we can remove this projection node for more cases as optimization in the future.
return true;
}
// ENG-4909 Bug: currently disable NESTLOOPINDEX plan for IN
private static boolean disableNestedLoopIndexJoinForInComparison (AbstractPlanNode root, AbstractParsedStmt parsedStmt) {
if (root.getPlanNodeType() == PlanNodeType.NESTLOOPINDEX) {
assert(parsedStmt != null);
return true;
}
return false;
}
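/**
 * Generate the next candidate plan for the current DELETE statement,
 * or null if the sub-plan assembler has no more plans.
 */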
private AbstractPlanNode getNextDeletePlan() {
assert (subAssembler != null);
// figure out which table we're deleting from
assert (m_parsedDelete.m_tableList.size() == 1);
Table targetTable = m_parsedDelete.m_tableList.get(0);
AbstractPlanNode subSelectRoot = subAssembler.nextPlan();
if (subSelectRoot == null) {
return null;
}
// ENG-4909 Bug: currently disable NESTLOOPINDEX plan for IN
if (disableNestedLoopIndexJoinForInComparison(subSelectRoot, m_parsedDelete)) {
// Recursion here, now that subAssembler.nextPlan() has been called,
// simply jumps ahead to the next plan (if any).
return getNextDeletePlan();
}
// generate the delete node with the right target table
DeletePlanNode deleteNode = new DeletePlanNode();
deleteNode.setTargetTableName(targetTable.getTypeName());
ProjectionPlanNode projectionNode = new ProjectionPlanNode();
AbstractExpression addressExpr = new TupleAddressExpression();
NodeSchema proj_schema = new NodeSchema();
// This planner-created column is magic.
proj_schema.addColumn(new SchemaColumn("VOLT_TEMP_TABLE",
"VOLT_TEMP_TABLE",
"tuple_address",
"tuple_address",
addressExpr));
projectionNode.setOutputSchema(proj_schema);
assert(subSelectRoot instanceof AbstractScanPlanNode);
// If the scan matches all rows, we can throw away the scan
// nodes and use a truncate delete node.
// Assume all index scans have filters in this context, so only consider seq scans.
if ( (subSelectRoot instanceof SeqScanPlanNode) &&
(((SeqScanPlanNode) subSelectRoot).getPredicate() == null)) {
deleteNode.setTruncate(true);
if (m_partitioning.wasSpecifiedAsSingle()) {
return deleteNode;
}
} else {
// connect the nodes to build the graph
deleteNode.addAndLinkChild(subSelectRoot);
// OPTIMIZATION: Projection Inline
// If the root node we got back from createSelectTree() is an
// AbstractScanNode, then
// we put the Projection node we just created inside of it
// When we inline this projection into the scan, we're going
// to overwrite any original projection that we might have inlined
// in order to simply cull the columns from the persistent table.
subSelectRoot.addInlinePlanNode(projectionNode);
}
if (m_partitioning.wasSpecifiedAsSingle() || m_partitioning.isInferredSingle()) {
return deleteNode;
}
// Send the local result counts to the coordinator.
AbstractPlanNode recvNode = SubPlanAssembler.addSendReceivePair(deleteNode);
// add a sum or a limit and send on top of the union
return addSumOrLimitAndSendToDMLNode(recvNode, targetTable.getIsreplicated());
}
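/**
 * Generate the next candidate plan for the current UPDATE statement,
 * or null if the sub-plan assembler has no more plans.
 */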
private AbstractPlanNode getNextUpdatePlan() {
assert (subAssembler != null);
AbstractPlanNode subSelectRoot = subAssembler.nextPlan();
if (subSelectRoot == null) {
return null;
}
if (disableNestedLoopIndexJoinForInComparison(subSelectRoot, m_parsedUpdate)) {
// Recursion here, now that subAssembler.nextPlan() has been called,
// simply jumps ahead to the next plan (if any).
return getNextUpdatePlan();
}
UpdatePlanNode updateNode = new UpdatePlanNode();
Table targetTable = m_parsedUpdate.m_tableList.get(0);
updateNode.setTargetTableName(targetTable.getTypeName());
// set this to false until proven otherwise
updateNode.setUpdateIndexes(false);
ProjectionPlanNode projectionNode = new ProjectionPlanNode();
TupleAddressExpression tae = new TupleAddressExpression();
NodeSchema proj_schema = new NodeSchema();
// This planner-generated column is magic.
proj_schema.addColumn(new SchemaColumn("VOLT_TEMP_TABLE",
"VOLT_TEMP_TABLE",
"tuple_address",
"tuple_address",
tae));
// get the set of columns affected by indexes
Set<String> affectedColumns = getIndexedColumnSetForTable(targetTable);
// add the output columns we need to the projection
//
// Right now, the EE is going to use the original column names
// and compare these to the persistent table column names in the
// update executor in order to figure out which table columns get
// updated. We'll associate the actual values with VOLT_TEMP_TABLE
// to avoid any false schema/column matches with the actual table.
for (Entry<Column, AbstractExpression> col : m_parsedUpdate.columns.entrySet()) {
String tableName = col.getKey().getTypeName();
AbstractExpression expr = col.getValue();
expr.setInBytes(col.getKey().getInbytes());
proj_schema.addColumn(new SchemaColumn("VOLT_TEMP_TABLE",
"VOLT_TEMP_TABLE",
tableName,
tableName,
expr));
// check if this column is an indexed column
if (affectedColumns.contains(col.getKey().getTypeName()))
{
updateNode.setUpdateIndexes(true);
}
}
projectionNode.setOutputSchema(proj_schema);
// add the projection inline (TODO: this will break if more than one
// layer is below this)
//
// When we inline this projection into the scan, we're going
// to overwrite any original projection that we might have inlined
// in order to simply cull the columns from the persistent table.
assert(subSelectRoot instanceof AbstractScanPlanNode);
subSelectRoot.addInlinePlanNode(projectionNode);
// connect the nodes to build the graph
updateNode.addAndLinkChild(subSelectRoot);
if (m_partitioning.wasSpecifiedAsSingle() || m_partitioning.isInferredSingle()) {
return updateNode;
}
// Send the local result counts to the coordinator.
AbstractPlanNode recvNode = SubPlanAssembler.addSendReceivePair(updateNode);
// add a sum or a limit and send on top of the union
return addSumOrLimitAndSendToDMLNode(recvNode, targetTable.getIsreplicated());
}
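/**
 * Wrap the expression in an explicit cast when its type or size does not match
 * the target column, so the temp tuple that holds the result is formatted correctly.
 */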
static private AbstractExpression castExprIfNeeded(AbstractExpression expr, Column column) {
if (expr.getValueType().getValue() != column.getType() ||
expr.getValueSize() != column.getSize()) {
expr = new OperatorExpression(ExpressionType.OPERATOR_CAST, expr, null);
expr.setValueType(VoltType.get((byte) column.getType()));
// We don't really support parameterized casting, such as specifically to "VARCHAR(3)"
// vs. just VARCHAR, but set the size parameter anyway in this case to make sure that
// the tuple that gets the result of the cast can be properly formatted as inline.
// A too-wide value survives the cast (to generic VARCHAR of any length) but the
// attempt to cache the result in the inline temp tuple storage will throw an early
// runtime error on behalf of the target table column.
// The important thing here is to leave the formatting hint in the output schema that
// drives the temp tuple layout.
expr.setValueSize(column.getSize());
}
return expr;
}
/**
* Get the next (only) plan for a SQL insertion. Inserts are pretty simple
* and this will only generate a single plan.
*
* @return The next plan for a given insert statement.
*/
private CompiledPlan getNextInsertPlan() {
// there's really only one way to do an insert, so just
// do it the right way once, then return null after that
if (m_bestAndOnlyPlanWasGenerated)
return null;
m_bestAndOnlyPlanWasGenerated = true;
// The child of the insert node produces rows containing values
// from one of
// - A VALUES clause. In this case the child node is a MaterializeNode
// - a SELECT statement as in "INSERT INTO ... SELECT ...". In this case
// the child node is the root of an arbitrary subplan.
// figure out which table we're inserting into
assert (m_parsedInsert.m_tableList.size() == 1);
Table targetTable = m_parsedInsert.m_tableList.get(0);
StmtSubqueryScan subquery = m_parsedInsert.isInsertWithSubquery() ?
m_parsedInsert.getSubqueries().get(0) : null;
CompiledPlan retval = null;
if (subquery != null) {
if (subquery.getBestCostPlan() == null) {
// Seems like this should really be caught earlier
// in getBestCostPlan, above.
throw new PlanningErrorException("INSERT INTO ... SELECT subquery could not be planned: "
+ m_recentErrorMsg);
}
InsertSubPlanAssembler subPlanAssembler =
new InsertSubPlanAssembler(m_catalogDb, m_parsedInsert, m_partitioning);
AbstractPlanNode subplan = subPlanAssembler.nextPlan();
if (subplan == null) {
throw new PlanningErrorException(subPlanAssembler.m_recentErrorMsg);
}
assert(m_partitioning.isJoinValid());
// Use the subquery's plan as the basis for the insert plan.
retval = subquery.getBestCostPlan();
}
else {
retval = new CompiledPlan();
}
retval.setReadOnly(false);
// Iterate over each column in the table we're inserting into:
// - Make sure we're supplying values for columns that require it
// - Set partitioning expressions for VALUES (...) case
CatalogMap<Column> targetTableColumns = targetTable.getColumns();
for (Column col : targetTableColumns) {
boolean needsValue = col.getNullable() == false && col.getDefaulttype() == 0;
if (needsValue && !m_parsedInsert.m_columns.containsKey(col)) {
// This check could be done during parsing?
throw new PlanningErrorException("Column " + col.getName()
+ " has no default and is not nullable.");
}
// Hint that this statement can be executed as single-partition.
if (col.equals(m_partitioning.getPartitionColForDML()) && subquery == null) {
// When AdHoc insert-into-select is supported, we'll need to be able to infer
// partitioning of the sub-select
AbstractExpression expr = m_parsedInsert.getExpressionForPartitioning(col);
String fullColumnName = targetTable.getTypeName() + "." + col.getTypeName();
m_partitioning.addPartitioningExpression(fullColumnName, expr, expr.getValueType());
}
}
NodeSchema matSchema = null;
if (subquery == null) {
matSchema = new NodeSchema();
}
int[] fieldMap = new int[m_parsedInsert.m_columns.size()];
int i = 0;
// The insert statement's set of columns is contained in a LinkedHashMap,
// meaning that we'll iterate over the columns here in the order that the user
// specified them in the original SQL. (If the statement didn't specify any
// columns, then all the columns will be in the map in schema order.)
// - Build the field map, used by insert executor to build tuple to execute
// - For VALUES(...) insert statements, build the materialize node's schema
for (Map.Entry<Column, AbstractExpression> e : m_parsedInsert.m_columns.entrySet()) {
Column col = e.getKey();
fieldMap[i] = col.getIndex();
if (matSchema != null) {
AbstractExpression valExpr = e.getValue();
valExpr.setInBytes(col.getInbytes());
// Patch over any mismatched expressions with an explicit cast.
// Most impossible-to-cast type combinations should have already been caught by the
// parser, but there are also runtime checks in the casting code
// -- such as for out of range values.
valExpr = castExprIfNeeded(valExpr, col);
matSchema.addColumn(new SchemaColumn("VOLT_TEMP_TABLE",
"VOLT_TEMP_TABLE",
col.getTypeName(),
col.getTypeName(),
valExpr));
}
i++;
}
// the root of the insert plan is always an InsertPlanNode
InsertPlanNode insertNode = new InsertPlanNode();
insertNode.setTargetTableName(targetTable.getTypeName());
// The field map tells the insert node
// where to put values produced by child into the row to be inserted.
insertNode.setFieldMap(fieldMap);
if (matSchema != null) {
MaterializePlanNode matNode = new MaterializePlanNode();
matNode.setOutputSchema(matSchema);
// connect the insert and the materialize nodes together
insertNode.addAndLinkChild(matNode);
retval.statementGuaranteesDeterminism(false, true);
} else {
insertNode.addAndLinkChild(retval.rootPlanGraph);
}
if (m_partitioning.wasSpecifiedAsSingle() || m_partitioning.isInferredSingle()) {
insertNode.setMultiPartition(false);
retval.rootPlanGraph = insertNode;
return retval;
}
insertNode.setMultiPartition(true);
AbstractPlanNode recvNode = SubPlanAssembler.addSendReceivePair(insertNode);
// add a sum or a limit and send on top of the union
retval.rootPlanGraph = addSumOrLimitAndSendToDMLNode(recvNode, targetTable.getIsreplicated());
return retval;
}
/**
* Adds a sum or limit node followed by a send node to the given DML node. If the DML target
* is a replicated table, it will add a limit node, otherwise it adds a sum node.
*
* @param dmlRoot
* @param isReplicated Whether or not the target table is a replicated table.
* @return
*/
private static AbstractPlanNode addSumOrLimitAndSendToDMLNode(AbstractPlanNode dmlRoot, boolean isReplicated)
{
AbstractPlanNode sumOrLimitNode;
if (isReplicated) {
// Replicated table DML result doesn't need to be summed. All partitions should
// modify the same number of tuples in replicated table, so just pick the result from
// any partition.
LimitPlanNode limitNode = new LimitPlanNode();
sumOrLimitNode = limitNode;
limitNode.setLimit(1);
} else {
// create the nodes being pushed on top of dmlRoot.
AggregatePlanNode countNode = new AggregatePlanNode();
sumOrLimitNode = countNode;
// configure the count aggregate (sum) node to produce a single
// output column containing the result of the sum.
// Create a TVE that should match the tuple count input column
// This TVE is magic.
// really really need to make this less hard-wired
TupleValueExpression count_tve = new TupleValueExpression(
"VOLT_TEMP_TABLE", "VOLT_TEMP_TABLE", "modified_tuples", "modified_tuples", 0);
count_tve.setValueType(VoltType.BIGINT);
count_tve.setValueSize(VoltType.BIGINT.getLengthInBytesForFixedTypes());
countNode.addAggregate(ExpressionType.AGGREGATE_SUM, false, 0, count_tve);
// The output column. Not really based on a TVE (it is really the
// count expression represented by the count configured above). But
// this is sufficient for now. This looks identical to the above
// TVE but it's logically different so we'll create a fresh one.
TupleValueExpression tve = new TupleValueExpression(
"VOLT_TEMP_TABLE", "VOLT_TEMP_TABLE", "modified_tuples", "modified_tuples", 0);
tve.setValueType(VoltType.BIGINT);
tve.setValueSize(VoltType.BIGINT.getLengthInBytesForFixedTypes());
NodeSchema count_schema = new NodeSchema();
SchemaColumn col = new SchemaColumn("VOLT_TEMP_TABLE",
"VOLT_TEMP_TABLE",
"modified_tuples",
"modified_tuples",
tve);
count_schema.addColumn(col);
countNode.setOutputSchema(count_schema);
}
// connect the nodes to build the graph
sumOrLimitNode.addAndLinkChild(dmlRoot);
SendPlanNode sendNode = new SendPlanNode();
sendNode.addAndLinkChild(sumOrLimitNode);
return sendNode;
}
/**
* Given a relatively complete plan-sub-graph, apply a trivial projection
* (filter) to it. If the root node can embed the projection, do so; if not,
* add a new projection node.
*
* @param rootNode
* The root of the plan-sub-graph to add the projection to.
* @return The new root of the plan-sub-graph (might be the same as the
* input).
*/
AbstractPlanNode addProjection(AbstractPlanNode rootNode) {
assert (m_parsedSelect != null);
assert (m_parsedSelect.m_displayColumns != null);
ProjectionPlanNode projectionNode =
new ProjectionPlanNode();
// Build the output schema for the projection based on the display columns
NodeSchema proj_schema = m_parsedSelect.getFinalProjectionSchema();
projectionNode.setOutputSchemaWithoutClone(proj_schema);
// if the projection can be done inline...
if (rootNode instanceof AbstractScanPlanNode) {
rootNode.addInlinePlanNode(projectionNode);
return rootNode;
} else {
projectionNode.addAndLinkChild(rootNode);
return projectionNode;
}
}
/**
* Create an order by node as required by the statement and make it a parent of root.
* @param root
* @return new orderByNode (the new root) or the original root if no orderByNode was required.
*/
AbstractPlanNode handleOrderBy(AbstractPlanNode root) {
assert (m_parsedSelect != null);
// Only sort when the statement has an ORDER BY.
if ( ! m_parsedSelect.hasOrderByColumns()) {
return root;
}
SortDirectionType sortDirection = SortDirectionType.INVALID;
// Skip the explicit ORDER BY plan step if an IndexScan is already providing the equivalent ordering.
// Note that even tree index scans that produce values in their own "key order" only report
// a sort direction != SortDirectionType.INVALID
// when they enforce an ordering equivalent to the one requested in the ORDER BY clause.
// Even an intervening non-hash aggregate will not interfere with this optimization.
AbstractPlanNode nonAggPlan = root;
if (root.getPlanNodeType() == PlanNodeType.AGGREGATE) {
nonAggPlan = root.getChild(0);
}
if (nonAggPlan instanceof IndexScanPlanNode) {
sortDirection = ((IndexScanPlanNode)nonAggPlan).getSortDirection();
}
// Optimization for NestLoopIndex on IN list, possibly other cases of ordered join results.
// Skip the explicit ORDER BY plan step if NestLoopIndex is providing the equivalent ordering
else if (nonAggPlan instanceof AbstractJoinPlanNode) {
sortDirection = ((AbstractJoinPlanNode)nonAggPlan).getSortDirection();
}
if (sortDirection != SortDirectionType.INVALID) {
return root;
}
OrderByPlanNode orderByNode = new OrderByPlanNode();
for (ParsedSelectStmt.ParsedColInfo col : m_parsedSelect.m_orderColumns) {
orderByNode.addSort(col.expression,
col.ascending ? SortDirectionType.ASC
: SortDirectionType.DESC);
}
orderByNode.addAndLinkChild(root);
return orderByNode;
}
/**
* Add a limit, pushed-down if possible, and return the new root.
* @param root top of the original plan
* @return new plan's root node
*/
private AbstractPlanNode handleLimitOperator(AbstractPlanNode root)
{
// The coordinator's top limit graph fragment for an MP plan.
// If planning "order by ... limit", getNextSelectPlan()
// will have already added an order by to the coordinator fragment.
// This is the only limit node in an SP plan.
LimitPlanNode topLimit = m_parsedSelect.getLimitNodeTop();
/*
* TODO: allow push down limit with distinct (select distinct C from T limit 5)
* or distinct in aggregates.
*/
AbstractPlanNode sendNode = null;
// Whether or not we can push the limit node down
boolean canPushDown = ! m_parsedSelect.hasDistinct();
if (canPushDown) {
sendNode = checkLimitPushDownViability(root);
if (sendNode == null) {
canPushDown = false;
} else {
canPushDown = m_parsedSelect.m_limitCanPushdown;
}
}
if (m_parsedSelect.m_mvFixInfo.needed()) {
// Do not push down limit for mv based distributed query.
canPushDown = false;
}
/*
* Push down the limit plan node when possible even if offset is set. If
* the plan is for a partitioned table, do the push down. Otherwise,
* there is no need to do the push-down work; the limit plan node will
* be run in the single partition.
*/
if (canPushDown) {
/*
* For partitioned table, the pushed-down limit plan node has a limit based
* on the combined limit and offset, which may require an expression if either of these
* was not a hard-coded constant and didn't get parameterized.
* The top level limit plan node remains the same, with the original limit and offset values.
*/
LimitPlanNode distLimit = m_parsedSelect.getLimitNodeDist();
// Disconnect the distributed parts of the plan below the SEND node
AbstractPlanNode distributedPlan = sendNode.getChild(0);
distributedPlan.clearParents();
sendNode.clearChildren();
// If the distributed limit must be performed on ordered input,
// ensure the order of the data on each partition.
distributedPlan = handleOrderBy(distributedPlan);
if (isInlineLimitPlanNodePossible(distributedPlan)) {
// Inline the distributed limit.
distributedPlan.addInlinePlanNode(distLimit);
sendNode.addAndLinkChild(distributedPlan);
} else {
distLimit.addAndLinkChild(distributedPlan);
// Add the distributed work back to the plan
sendNode.addAndLinkChild(distLimit);
}
}
// In the future, LIMIT could be inlined for join and receive nodes as well,
// and then the order-by node would not need to be distinguished.
// Switch the attachment point if the statement has complex aggregations.
if (m_parsedSelect.hasComplexAgg()) {
AbstractPlanNode child = root.getChild(0);
if (isInlineLimitPlanNodePossible(child)) {
child.addInlinePlanNode(topLimit);
} else {
root.clearChildren();
child.clearParents();
topLimit.addAndLinkChild(child);
root.addAndLinkChild(topLimit);
}
} else {
if (isInlineLimitPlanNodePossible(root)) {
root.addInlinePlanNode(topLimit);
} else if (root instanceof ProjectionPlanNode &&
isInlineLimitPlanNodePossible(root.getChild(0)) ) {
// In the future, this projection node could be inlined for OrderBy and Aggregate
// nodes, and then this ELSE IF block could be deleted.
root.getChild(0).addInlinePlanNode(topLimit);
} else {
topLimit.addAndLinkChild(root);
root = topLimit;
}
}
return root;
}
/**
* An inline limit plan node can be applied to an ORDER BY node or a serial aggregation node.
* @param pn the candidate parent plan node
* @return true if pn can absorb an inline limit plan node
*/
static private boolean isInlineLimitPlanNodePossible(AbstractPlanNode pn) {
if (pn instanceof OrderByPlanNode ||
pn.getPlanNodeType() == PlanNodeType.AGGREGATE)
{
return true;
}
return false;
}
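/**
 * Fix up a multi-partition query on a partitioned materialized view by inserting
 * a re-aggregation node above the receive node and, for a normal joined query,
 * re-targeting the distributed fragment at the materialized view's scan node.
 */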
AbstractPlanNode handleMVBasedMultiPartQuery (AbstractPlanNode root, boolean edgeCaseOuterJoin) {
MaterializedViewFixInfo mvFixInfo = m_parsedSelect.m_mvFixInfo;
HashAggregatePlanNode reAggNode = new HashAggregatePlanNode(mvFixInfo.getReAggregationPlanNode());
reAggNode.clearChildren();
reAggNode.clearParents();
AbstractPlanNode receiveNode = root;
AbstractPlanNode reAggParent = null;
// Find receive plan node and insert the constructed re-aggregation plan node.
if (root.getPlanNodeType() == PlanNodeType.RECEIVE) {
root = reAggNode;
} else {
List<AbstractPlanNode> recList = root.findAllNodesOfType(PlanNodeType.RECEIVE);
assert(recList.size() == 1);
receiveNode = recList.get(0);
reAggParent = receiveNode.getParent(0);
boolean result = reAggParent.replaceChild(receiveNode, reAggNode);
assert(result);
}
reAggNode.addAndLinkChild(receiveNode);
assert(receiveNode instanceof ReceivePlanNode);
AbstractPlanNode sendNode = receiveNode.getChild(0);
assert(sendNode instanceof SendPlanNode);
AbstractPlanNode sendNodeChild = sendNode.getChild(0);
HashAggregatePlanNode reAggNodeForReplace = null;
if (m_parsedSelect.m_tableList.size() > 1 && !edgeCaseOuterJoin) {
reAggNodeForReplace = reAggNode;
}
boolean find = mvFixInfo.processScanNodeWithReAggNode(sendNode, reAggNodeForReplace);
assert(find);
// If it is a normal joined query, replace the node under the receive node with the materialized view scan node.
if (m_parsedSelect.m_tableList.size() > 1 && !edgeCaseOuterJoin) {
AbstractPlanNode joinNode = sendNodeChild;
// No aggregation or limit has been pushed down at this point.
assert(joinNode instanceof AbstractJoinPlanNode);
// Fix the node after Re-aggregation node.
joinNode.clearParents();
assert(mvFixInfo.m_scanNode != null);
mvFixInfo.m_scanNode.clearParents();
// replace joinNode with MV scan node on each partition.
sendNode.clearChildren();
sendNode.addAndLinkChild(mvFixInfo.m_scanNode);
// If reAggNode had a parent node before we put it under the join node,
// that parent becomes the parent of the new join node. Update the root node.
if (reAggParent != null) {
reAggParent.replaceChild(reAggNode, joinNode);
root = reAggParent;
} else {
root = joinNode;
}
}
return root;
}
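/** Records whether and how a GROUP BY can be served by ordered index access. */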
class IndexGroupByInfo {
boolean m_multiPartition = false;
List<Integer> m_coveredGroupByColumns;
boolean m_canBeFullySerialized = false;
AbstractPlanNode m_indexAccess = null;
public boolean isChangedToSerialAggregate() {
return m_canBeFullySerialized && m_indexAccess != null;
}
public boolean isChangedToPartialAggregate() {
return !m_canBeFullySerialized && m_indexAccess != null;
}
public boolean needHashAggregator(AbstractPlanNode root) {
// A hash is required to build up per-group aggregates in parallel.
// It is not needed when there is only one aggregation over the entire table,
// OR when the per-group aggregates are being built serially from the ordered
// output of an index scan.
// Currently, an index scan only claims to have a sort direction when its output
// matches the order demanded by the ORDER BY clause.
if (! m_parsedSelect.isGrouped()) {
return false;
}
if (isChangedToSerialAggregate() && ! m_multiPartition) {
return false;
}
boolean predeterminedOrdering = false;
if (root instanceof IndexScanPlanNode) {
if (((IndexScanPlanNode)root).getSortDirection() != SortDirectionType.INVALID) {
predeterminedOrdering = true;
}
}
else if (root instanceof AbstractJoinPlanNode) {
if (((AbstractJoinPlanNode)root).getSortDirection() != SortDirectionType.INVALID) {
predeterminedOrdering = true;
}
}
if (predeterminedOrdering) {
// The ordering predetermined by indexed access is known to cover (at least) the
// ORDER BY columns.
// Yet, any additional non-ORDER-BY columns in the GROUP BY clause will need
// a partial aggregate.
if (m_parsedSelect.groupByIsAnOrderByPermutation()) {
return false;
}
}
return true;
}
}
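/**
 * Walk the outer side of a join tree looking for a sequential scan on a
 * persistent table that might be replaced by an index scan for GROUP BY.
 */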
private static AbstractPlanNode findSeqScanCandidateForGroupBy(AbstractPlanNode candidate) {
if (candidate.getPlanNodeType() == PlanNodeType.SEQSCAN &&
! candidate.isSubQuery()) {
// A scan on a sub-query does not support an index, so sub-queries are excluded here.
// In the future, sub-query edge cases could be supported.
return candidate;
}
// For a join node, find the outer sequential scan plan node
if (candidate.getPlanNodeType() == PlanNodeType.NESTLOOP) {
assert(candidate.getChildCount() == 2);
return findSeqScanCandidateForGroupBy(candidate.getChild(0));
}
if (candidate.getPlanNodeType() == PlanNodeType.NESTLOOPINDEX) {
return findSeqScanCandidateForGroupBy(candidate.getChild(0));
}
return null;
}
/**
* For a seqscan feeding a GROUP BY, consider substituting an IndexScan that pre-sorts
* by the GROUP BY keys. This is a much bigger win if the aggregation can get pushed
* down so that the ordering is not lost by the lack of a mergesort in the RECEIVE node.
* @param candidate
* @param gbInfo
* @return true if the planner switched from a sequential scan to an index scan
* that has no parent plan node (making it the new root); false otherwise.
*/
private boolean switchToIndexScanForGroupBy(AbstractPlanNode candidate, IndexGroupByInfo gbInfo) {
if (! m_parsedSelect.isGrouped()) {
return false;
}
AbstractPlanNode sourceSeqScan = findSeqScanCandidateForGroupBy(candidate);
if (sourceSeqScan == null) {
return false;
}
assert(sourceSeqScan instanceof SeqScanPlanNode);
AbstractPlanNode parent = null;
if (sourceSeqScan.getParentCount() > 0) {
parent = sourceSeqScan.getParent(0);
}
AbstractPlanNode indexAccess = indexAccessForGroupByExprs(
(SeqScanPlanNode)sourceSeqScan, gbInfo);
if (indexAccess.getPlanNodeType() != PlanNodeType.INDEXSCAN) {
// did not find a proper index to replace the sequential scan
return false;
}
gbInfo.m_indexAccess = indexAccess;
if (parent != null) {
// There is a parent, so replace the sequential scan with the index scan in place.
indexAccess.clearParents();
// For a two-child join node, index 0 is its outer side
parent.replaceChild(0, indexAccess);
return false;
}
// The parent is null, so the index scan replaces the sequential scan as the new root.
return true;
}
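/**
 * Insert the aggregation plan node(s) required by the statement, choosing
 * serial, partial, or hash aggregation and, where possible, a pushed-down
 * partition-level aggregate topped by a coordinator aggregate.
 */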
AbstractPlanNode handleAggregationOperators(AbstractPlanNode root) {
AggregatePlanNode aggNode = null;
/* Check if any aggregate expressions are present */
/*
* "Select A from T group by A" is grouped but has no aggregate operator
* expressions. Catch that case by checking the grouped flag
*/
if (m_parsedSelect.hasAggregateOrGroupby()) {
AggregatePlanNode topAggNode = null;
IndexGroupByInfo gbInfo = new IndexGroupByInfo();
if (root.getPlanNodeType() == PlanNodeType.RECEIVE) {
AbstractPlanNode candidate = root.getChild(0).getChild(0);
gbInfo.m_multiPartition = true;
switchToIndexScanForGroupBy(candidate, gbInfo);
} else if (switchToIndexScanForGroupBy(root, gbInfo)) {
root = gbInfo.m_indexAccess;
}
boolean needHashAgg = gbInfo.needHashAggregator(root);
// Construct the aggregate nodes
if (needHashAgg) {
if ( m_parsedSelect.m_mvFixInfo.needed() ) {
// TODO: may optimize this edge case in future
aggNode = new HashAggregatePlanNode();
} else {
if (gbInfo.isChangedToSerialAggregate()) {
assert(root instanceof ReceivePlanNode);
aggNode = new AggregatePlanNode();
} else if (gbInfo.isChangedToPartialAggregate()) {
aggNode = new PartialAggregatePlanNode(gbInfo.m_coveredGroupByColumns);
} else {
aggNode = new HashAggregatePlanNode();
}
topAggNode = new HashAggregatePlanNode();
}
} else {
aggNode = new AggregatePlanNode();
if ( ! m_parsedSelect.m_mvFixInfo.needed()) {
topAggNode = new AggregatePlanNode();
}
}
int outputColumnIndex = 0;
NodeSchema agg_schema = new NodeSchema();
NodeSchema top_agg_schema = new NodeSchema();
for (ParsedSelectStmt.ParsedColInfo col : m_parsedSelect.m_aggResultColumns) {
AbstractExpression rootExpr = col.expression;
AbstractExpression agg_input_expr = null;
SchemaColumn schema_col = null;
SchemaColumn top_schema_col = null;
if (rootExpr instanceof AggregateExpression) {
ExpressionType agg_expression_type = rootExpr.getExpressionType();
agg_input_expr = rootExpr.getLeft();
// A bit of a hack: ProjectionNodes after the
// aggregate node need the output columns here to
// contain TupleValueExpressions (effectively on a temp table).
// So we construct one based on the output of the
// aggregate expression, the column alias provided by HSQL,
// and the offset into the output table schema for the
// aggregate node that we're computing.
// Oh, oh, it's magic, you know..
TupleValueExpression tve = new TupleValueExpression(
"VOLT_TEMP_TABLE", "VOLT_TEMP_TABLE", "", col.alias, outputColumnIndex);
tve.setTypeSizeBytes(rootExpr.getValueType(), rootExpr.getValueSize(),
rootExpr.getInBytes());
boolean is_distinct = ((AggregateExpression)rootExpr).isDistinct();
aggNode.addAggregate(agg_expression_type, is_distinct, outputColumnIndex, agg_input_expr);
schema_col = new SchemaColumn("VOLT_TEMP_TABLE", "VOLT_TEMP_TABLE", "", col.alias, tve);
top_schema_col = new SchemaColumn("VOLT_TEMP_TABLE", "VOLT_TEMP_TABLE", "", col.alias, tve);
/*
* Special case count(*), count(), sum(), min() and max() to
* push them down to each partition. It will do the
* push-down if the select columns contain only the listed
* aggregate operators and other group-by columns. If the
* select columns include any other aggregates, it will not
* do the push-down. - nshi
*/
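/*
 * Illustrative example: for
 *   SELECT A, COUNT(B) FROM T GROUP BY A
 * each partition computes COUNT(B) per group, and the coordinator
 * applies SUM over the partial counts to produce the final COUNT.
 */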
if (topAggNode != null) {
ExpressionType top_expression_type = agg_expression_type;
/*
* For count(*), count() and sum(), the pushed-down
* aggregate node doesn't change. An extra sum()
* aggregate node is added to the coordinator to sum up
* the numbers from all the partitions. The input schema
* and the output schema of the sum() aggregate node are
* the same as the output schema of the push-down
* aggregate node.
*
* If DISTINCT is specified, don't do push-down for
* count() and sum()
*/
if (agg_expression_type == ExpressionType.AGGREGATE_COUNT_STAR ||
agg_expression_type == ExpressionType.AGGREGATE_COUNT ||
agg_expression_type == ExpressionType.AGGREGATE_SUM) {
if (is_distinct) {
topAggNode = null;
}
else {
top_expression_type = ExpressionType.AGGREGATE_SUM;
}
}
/*
* For min() and max(), the pushed-down aggregate node
* doesn't change. An extra aggregate node of the same
* type is added to the coordinator. The input schema
* and the output schema of the top aggregate node are
* the same as the output schema of the pushed-down
* aggregate node.
*/
else if (agg_expression_type != ExpressionType.AGGREGATE_MIN &&
agg_expression_type != ExpressionType.AGGREGATE_MAX) {
/*
* Unsupported aggregate for push-down (AVG for example).
*/
topAggNode = null;
}
if (topAggNode != null) {
/*
* Input column of the top aggregate node is the output column of the push-down aggregate node
*/
topAggNode.addAggregate(top_expression_type, is_distinct, outputColumnIndex, tve);
}
}
}
// If the rootExpr is not itself an AggregateExpression but simply contains one (or more)
// like "MAX(counter)+1" or "MAX(col)/MIN(col)" the assumptions about matching input and output
// columns break down.
else if (rootExpr.hasAnySubexpressionOfClass(AggregateExpression.class)) {
assert(false);
}
else
{
/*
* These columns are the pass through columns that are not being
* aggregated on. These are the ones from the SELECT list. They
* MUST already exist in the child node's output. Find them and
* add them to the aggregate's output.
*/
schema_col = new SchemaColumn(col.tableName, col.tableAlias, col.columnName, col.alias, col.expression);
AbstractExpression topExpr = null;
if (col.groupBy) {
topExpr = m_parsedSelect.m_groupByExpressions.get(col.alias);
} else {
topExpr = col.expression;
}
top_schema_col = new SchemaColumn(col.tableName, col.tableAlias, col.columnName, col.alias, topExpr);
}
agg_schema.addColumn(schema_col);
top_agg_schema.addColumn(top_schema_col);
outputColumnIndex++;
}
for (ParsedSelectStmt.ParsedColInfo col : m_parsedSelect.m_groupByColumns) {
aggNode.addGroupByExpression(col.expression);
if (topAggNode != null) {
topAggNode.addGroupByExpression(m_parsedSelect.m_groupByExpressions.get(col.alias));
}
}
aggNode.setOutputSchema(agg_schema);
if (topAggNode != null) {
if (m_parsedSelect.hasComplexGroupby()) {
topAggNode.setOutputSchema(top_agg_schema);
} else {
topAggNode.setOutputSchema(agg_schema);
}
}
// Never push down aggregation in the MV fix case: topAggNode is null there,
// so pushDownAggregate simply places the aggregate node at the top of the plan.
root = pushDownAggregate(root, aggNode, topAggNode, m_parsedSelect);
}
if (m_parsedSelect.isGrouped()) {
// DISTINCT is redundant with GROUP BY IFF all of the grouping columns are present in the display columns. Return early.
if (m_parsedSelect.displayColumnsContainAllGroupByColumns()) {
return root;
}
}
// DISTINCT is redundant on a single-row result. Return early.
else if (m_parsedSelect.hasAggregateExpression()) {
return root;
}
// Handle DISTINCT if it is not redundant with aggregation/grouping.
return handleDistinct(root);
}
// Turn a sequential scan into an index scan for GROUP BY, if possible
private AbstractPlanNode indexAccessForGroupByExprs(SeqScanPlanNode root,
IndexGroupByInfo gbInfo) {
if (root.isSubQuery()) {
// The sub-query edge case is not handled yet.
return root;
}
String fromTableAlias = root.getTargetTableAlias();
assert(fromTableAlias != null);
ArrayList<ParsedColInfo> groupBys = m_parsedSelect.m_groupByColumns;
Table targetTable = m_catalogDb.getTables().get(root.getTargetTableName());
assert(targetTable != null);
CatalogMap<Index> allIndexes = targetTable.getIndexes();
List<Integer> maxCoveredGroupByColumns = new ArrayList<>();
ArrayList<AbstractExpression> maxCoveredBindings = null;
Index pickedUpIndex = null;
boolean foundAllGroupByCoveredIndex = false;
for (Index index : allIndexes) {
if ( ! IndexType.isScannable(index.getType())) {
continue;
}
ArrayList<AbstractExpression> bindings = new ArrayList<AbstractExpression>();
List<Integer> coveredGroupByColumns = calculateGroupbyColumnsCovered(
index, fromTableAlias, bindings);
if (coveredGroupByColumns.size() > maxCoveredGroupByColumns.size()) {
maxCoveredGroupByColumns = coveredGroupByColumns;
pickedUpIndex = index;
maxCoveredBindings = bindings;
if (maxCoveredGroupByColumns.size() == groupBys.size()) {
foundAllGroupByCoveredIndex = true;
break;
}
}
}
if (pickedUpIndex == null) {
return root;
}
IndexScanPlanNode indexScanNode = new IndexScanPlanNode(
root, null, pickedUpIndex, SortDirectionType.INVALID);
indexScanNode.setForGroupingOnly();
indexScanNode.setBindings(maxCoveredBindings);
gbInfo.m_coveredGroupByColumns = maxCoveredGroupByColumns;
gbInfo.m_canBeFullySerialized = foundAllGroupByCoveredIndex;
return indexScanNode;
}
private List<Integer> calculateGroupbyColumnsCovered(Index index, String fromTableAlias,
List<AbstractExpression> bindings) {
List<Integer> coveredGroupByColumns = new ArrayList<>();
ArrayList<ParsedColInfo> groupBys = m_parsedSelect.m_groupByColumns;
String exprsjson = index.getExpressionsjson();
if (exprsjson.isEmpty()) {
List<ColumnRef> indexedColRefs = CatalogUtil.getSortedCatalogItems(index.getColumns(), "index");
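// Walk the index's key columns in order; each key column must match some
// GROUP BY column (in any position within the GROUP BY list) to extend the
// covered prefix. Illustrative example: an index on (A, B) fully covers
// GROUP BY B, A.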
for (int j = 0; j < indexedColRefs.size(); j++) {
String indexColumnName = indexedColRefs.get(j).getColumn().getName();
// ignore order of keys in GROUP BY expr
int ithCovered = 0;
boolean foundPrefixedColumn = false;
for (; ithCovered < groupBys.size(); ithCovered++) {
AbstractExpression gbExpr = groupBys.get(ithCovered).expression;
if ( ! (gbExpr instanceof TupleValueExpression)) {
continue;
}
TupleValueExpression gbTVE = (TupleValueExpression)gbExpr;
// TVE column indexes have not been resolved at this point
if ( ! fromTableAlias.equals(gbTVE.getTableAlias())) {
continue;
}
if (indexColumnName.equals(gbTVE.getColumnName())) {
foundPrefixedColumn = true;
break;
}
}
if (! foundPrefixedColumn) {
// The covered prefix ends here.
break;
}
coveredGroupByColumns.add(ithCovered);
if (coveredGroupByColumns.size() == groupBys.size()) {
// covered all group by columns already
break;
}
}
} else {
StmtTableScan fromTableScan = m_parsedSelect.m_tableAliasMap.get(fromTableAlias);
// either pure expression index or mix of expressions and simple columns
List<AbstractExpression> indexedExprs = null;
try {
indexedExprs = AbstractExpression.fromJSONArrayString(exprsjson, fromTableScan);
} catch (JSONException e) {
e.printStackTrace();
// This case should be impossible.
return coveredGroupByColumns;
}
for (int j = 0; j < indexedExprs.size(); j++) {
AbstractExpression indexExpr = indexedExprs.get(j);
// ignore order of keys in GROUP BY expr
int ithCovered = 0;
List<AbstractExpression> binding = null;
for (; ithCovered < groupBys.size(); ithCovered++) {
AbstractExpression gbExpr = groupBys.get(ithCovered).expression;
binding = gbExpr.bindingToIndexedExpression(indexExpr);
if (binding != null) {
break;
}
}
if (binding == null) {
// The covered prefix ends here, or all GROUP BY columns are already covered.
break;
}
bindings.addAll(binding);
coveredGroupByColumns.add(ithCovered);
if (coveredGroupByColumns.size() == groupBys.size()) {
// covered all group by columns already
break;
}
}
}
return coveredGroupByColumns;
}
/**
* Push down the given aggregate node if the plan is distributed, then add the
* coordinator node on top of the send/receive pair. If the plan
* is not distributed, or coordNode is not provided, the distNode
* is added at the top of the plan.
*
* Note: this works in part because the push-down node is also an acceptable
* top level node if the plan is not distributed. This wouldn't be true
* if we started pushing down something like (sum, count) to calculate
* a distributed average.
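*
* Illustrative plan shape after a distributed push-down:
*   coordNode <- Receive <- Send <- distNode <- (original sub-plan)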
*
* @param root
* The root node
* @param distNode
* The node to push down
* @param coordNode
* The top node to put on top of the send/receive pair after
* push-down. If this is null, no push-down will be performed.
* @return The new root node.
*/
AbstractPlanNode pushDownAggregate(AbstractPlanNode root,
AggregatePlanNode distNode,
AggregatePlanNode coordNode,
ParsedSelectStmt selectStmt) {
boolean needCoordNode = !selectStmt.hasPartitionColumnInGroupby();
// Remember that the coordinating aggregation has a pushed-down
// counterpart deeper in the plan. This allows other operators
// to be pushed down past the receive as well.
if (coordNode != null) {
coordNode.m_isCoordinatingAggregator = true;
}
/*
* Push this node down to partition if it's distributed. First remove
* the send/receive pair, add the node, then put the send/receive pair
* back on top of the node, followed by another top node at the
* coordinator.
*/
if (coordNode != null && root instanceof ReceivePlanNode) {
AbstractPlanNode accessPlanTemp = root;
root = root.getChild(0).getChild(0);
root.clearParents();
distNode.addAndLinkChild(root);
root = distNode;
// Put the send/receive pair back into place
accessPlanTemp.getChild(0).clearChildren();
accessPlanTemp.getChild(0).addAndLinkChild(root);
root = accessPlanTemp;
// Add the top node
if (needCoordNode) {
coordNode.addAndLinkChild(root);
root = coordNode;
// Set post predicate for top Aggregation node
coordNode.setPostPredicate(m_parsedSelect.m_having);
} else {
// Set post predicate for final distributed Aggregation node
distNode.setPostPredicate(m_parsedSelect.m_having);
}
} else {
distNode.addAndLinkChild(root);
root = distNode;
// Set post predicate for final distributed Aggregation node
distNode.setPostPredicate(m_parsedSelect.m_having);
}
if (selectStmt.hasComplexAgg()) {
ProjectionPlanNode proj = new ProjectionPlanNode();
proj.addAndLinkChild(root);
proj.setOutputSchema(selectStmt.getFinalProjectionSchema());
root = proj;
}
return root;
}
/**
* Check if we can push the limit node down.
*
* @param root
* @return If we can push it down, the send plan node is returned. Otherwise,
* it returns null.
*/
protected AbstractPlanNode checkLimitPushDownViability(AbstractPlanNode root) {
AbstractPlanNode receiveNode = root;
// Return a mid-plan send node, if one exists and can host a distributed limit node.
// There is guaranteed to be at most a single receive/send pair.
// Abort the search if a node that a "limit" can't be pushed past is found before its receive node.
//
// Can only push past:
// * coordinatingAggregator: a distributed aggregator a copy of which has already been pushed down.
// Distributing a LIMIT to just above that aggregator is correct. (I've got some doubts that this is correct??? --paul)
//
// * order by: if the plan requires a sort, getNextSelectPlan() will have already added an ORDER BY.
// A distributed LIMIT will be added above a copy of that ORDER BY node.
//
// * projection: these have no effect on the application of limits.
//
// Return null if the plan is single-partition or if its "coordinator" part contains a push-blocking node type.
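// Illustrative example: for
//   SELECT A, COUNT(*) FROM T GROUP BY A ORDER BY A LIMIT 10
// a distributed LIMIT may be placed above each partition's copy of the
// ORDER BY, while the coordinator still applies the final ORDER BY and LIMIT.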
List<ParsedColInfo> orderBys = m_parsedSelect.orderByColumns();
boolean orderByCoversAllGroupBy = m_parsedSelect.groupByIsAnOrderByPermutation();
while (!(receiveNode instanceof ReceivePlanNode)) {
// Limitation: can only push past some nodes (see above comment)
// The blanket aggregate-node case was removed to handle ENG-6485; a limit is
// pushed past an aggregate node only when isValidAggregateNodeForLimitPushdown allows it.
// TODO: We might want to optimize/push down "limit" for some cases
if (!(receiveNode instanceof OrderByPlanNode) &&
!(receiveNode instanceof ProjectionPlanNode) &&
! isValidAggregateNodeForLimitPushdown(receiveNode, orderBys, orderByCoversAllGroupBy) ) {
return null;
}
if (receiveNode instanceof OrderByPlanNode) {
// When grouping by the partition key, the limit can still be pushed down
// even when ordering by aggregate values; otherwise it cannot.
if (! m_parsedSelect.hasPartitionColumnInGroupby() &&
isOrderByAggregationValue(m_parsedSelect.orderByColumns())) {
return null;
}
}
// Traverse...
if (receiveNode.getChildCount() == 0) {
return null;
}
// Nothing that a limit can be pushed past has multiple inputs.
assert(receiveNode.getChildCount() == 1);
receiveNode = receiveNode.getChild(0);
}
return receiveNode.getChild(0);
}
private static boolean isOrderByAggregationValue(List<ParsedColInfo> orderBys) {
for (ParsedSelectStmt.ParsedColInfo col : orderBys) {
AbstractExpression rootExpr = col.expression;
// Fix ENG-3487: can't push down limits when results are ordered by aggregate values.
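// e.g. an ORDER BY on COUNT(*): the ordering depends on post-aggregation
// values that no single partition can compute on its own.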
ArrayList<AbstractExpression> tves = rootExpr.findBaseTVEs();
for (AbstractExpression tve: tves) {
if (((TupleValueExpression) tve).hasAggregate()) {
return true;
}
}
}
return false;
}
private static boolean isValidAggregateNodeForLimitPushdown(AbstractPlanNode aggregateNode,
List<ParsedColInfo> orderBys, boolean orderByCoversAllGroupBy) {
if (!(aggregateNode instanceof AggregatePlanNode)) {
return false;
}
if (aggregateNode.getParentCount() == 0) {
return false;
}
// Limitation: can only push past coordinating aggregation nodes
if (!((AggregatePlanNode)aggregateNode).m_isCoordinatingAggregator) {
return false;
}
AbstractPlanNode parent = aggregateNode.getParent(0);
AbstractPlanNode orderByNode = null;
if (parent instanceof OrderByPlanNode) {
orderByNode = parent;
} else if ( parent instanceof ProjectionPlanNode &&
parent.getParentCount() > 0 &&
parent.getParent(0) instanceof OrderByPlanNode) {
// Xin really wants inline project with aggregation
orderByNode = parent.getParent(0);
}
if (orderByNode == null) {
// When there is an aggregate but no ORDER BY, and the GROUP BY columns do
// not contain the partition column, the limit must not be pushed down.
// Remember: when grouping by the partition column, there is no top aggregate plan node.
return false;
}
if (! orderByCoversAllGroupBy || isOrderByAggregationValue(orderBys)) {
return false;
}
return true;
}
/**
* Handle select distinct a from t
*
* @param root
* @return
*/
AbstractPlanNode handleDistinct(AbstractPlanNode root) {
if (m_parsedSelect.hasDistinct()) {
//TODO: The long-term goal to fix all the ills of Distinct is to implement distinct as
// a final GROUP BY all columns and eliminate Distinct as separately implemented
// PlanNode and Executor class types.
// The riskiest edge case in this approach to Distinct is when the distinct is applied
// on top of a GROUP BY -- either explicit in the query or implied by a materialized view --
// AND not all the GROUP BY expressions are in the result columns.
// If all the GROUP BY expressions are in the result columns, the result is already Distinct.
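// Illustrative risky case: SELECT DISTINCT A FROM T GROUP BY A, B produces
// one row per (A, B) group, so A values may repeat and a real DISTINCT pass
// is still required.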
// We currently can't handle DISTINCT of multiple columns.
// Throw a planner error if this is attempted.
if (m_parsedSelect.m_displayColumns.size() > 1) {
throw new PlanningErrorException("Multiple DISTINCT columns currently unsupported");
}
AbstractExpression distinctExpr = null;
for (ParsedSelectStmt.ParsedColInfo col : m_parsedSelect.m_displayColumns) {
// Distinct can in theory handle any expression now, but it's
// untested so we'll balk on anything other than a TVE here
// --izzy
if (col.expression instanceof TupleValueExpression)
{
assert(distinctExpr == null);
distinctExpr = col.expression;
}
else
{
throw new PlanningErrorException("DISTINCT of an expression currently unsupported");
}
}
// Add a distinct node to the plan.
assert(distinctExpr != null);
root = addDistinctNodes(root, distinctExpr);
// Aggregate handlers are expected to produce the required projection.
// The other aggregates do this inherently, but DISTINCT may need a
// projection node.
root = addProjection(root);
}
return root;
}
/**
* If the plan is distributed, then add distinct nodes to each partition and the coordinator.
* Otherwise, simply add the distinct node on top of the current root.
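*
* Illustrative distributed plan shape:
*   Distinct(coordinator) <- Receive <- Send <- Distinct(partition) <- scan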
*
* @param root The root node
* @param expr The distinct expression
* @return The new root node.
*/
AbstractPlanNode addDistinctNodes(AbstractPlanNode root, AbstractExpression expr)
{
assert(root != null);
AbstractPlanNode accessPlanTemp = root;
if (root instanceof ReceivePlanNode && !m_parsedSelect.m_mvFixInfo.needed()) {
// Temporarily strip send/receive pair
accessPlanTemp = root.getChild(0).getChild(0);
accessPlanTemp.clearParents();
root.getChild(0).unlinkChild(accessPlanTemp);
// Add new distinct node to each partition
AbstractPlanNode distinctNode = addDistinctNode(accessPlanTemp, expr);
// Add send/receive pair back
root.getChild(0).addAndLinkChild(distinctNode);
}
// Add new distinct node to the coordinator
root = addDistinctNode(root, expr);
return root;
}
/**
* Build new distinct node and put it on top of the current root
*
* @param root The root node
* @param expr The distinct expression
* @return The new root node.
*/
private static AbstractPlanNode addDistinctNode(AbstractPlanNode root, AbstractExpression expr)
{
DistinctPlanNode distinctNode = new DistinctPlanNode();
distinctNode.setDistinctExpression(expr);
distinctNode.addAndLinkChild(root);
return distinctNode;
}
/**
* Get the unique set of names of all columns that are part of an index on
* the given table.
*
* @param table
* The table to build the list of index-affected columns with.
* @return The set of column names affected by indexes with duplicates
* removed.
*/
public static Set<String> getIndexedColumnSetForTable(Table table) {
HashSet<String> columns = new HashSet<String>();
for (Index index : table.getIndexes()) {
for (ColumnRef colRef : index.getColumns()) {
columns.add(colRef.getColumn().getTypeName());
}
}
return columns;
}
public String getErrorMessage() {
return m_recentErrorMsg;
}
/**
* Outer join simplification using null rejection.
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.43.2531
* Outerjoin Simplification and Reordering for Query Optimization
* by Cesar A. Galindo-Legaria , Arnon Rosenthal
* Algorithm:
* Traverse the join tree top-down:
* For each join node n1 do:
* For each expression expr (join and where) at the node n1
* For each join node n2 descended from n1 do:
* If expr rejects nulls introduced by n2's inner table,
* then convert n2 to an inner join. If n2 is a full join, this step must be
* repeated for both the inner and outer tables of n2.
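*
* Illustrative example:
*   SELECT * FROM R LEFT JOIN S ON R.a = S.a WHERE S.b = 1;
* here the WHERE predicate S.b = 1 rejects NULLs introduced for S, so the
* LEFT join can be converted to an INNER join.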
*/
private static void simplifyOuterJoin(BranchNode joinTree) {
assert(joinTree != null);
List<AbstractExpression> exprs = new ArrayList<AbstractExpression>();
JoinNode leftNode = joinTree.getLeftNode();
JoinNode rightNode = joinTree.getRightNode();
// For the top level node only WHERE expressions need to be evaluated for NULL-rejection
if (leftNode.getWhereExpression() != null) {
exprs.add(leftNode.getWhereExpression());
}
if (rightNode.getWhereExpression() != null) {
exprs.add(rightNode.getWhereExpression());
}
simplifyOuterJoinRecursively(joinTree, exprs);
}
private static void simplifyOuterJoinRecursively(BranchNode joinNode, List<AbstractExpression> exprs) {
assert (joinNode != null);
JoinNode leftNode = joinNode.getLeftNode();
JoinNode rightNode = joinNode.getRightNode();
if (joinNode.getJoinType() == JoinType.LEFT) {
for (AbstractExpression expr : exprs) {
// Get all the tables underneath this node and
// see if the expression is NULL-rejecting for any of them
Collection<String> tableAliases = rightNode.generateTableJoinOrder();
boolean rejectNull = false;
for (String tableAlias : tableAliases) {
if (ExpressionUtil.isNullRejectingExpression(expr, tableAlias)) {
// We are done at this level
joinNode.setJoinType(JoinType.INNER);
rejectNull = true;
break;
}
}
if (rejectNull) {
break;
}
}
} else {
assert(joinNode.getJoinType() == JoinType.INNER);
}
// Now add this node expression to the list and descend
// In the case of an outer join, the inner node contributes its WHERE and JOIN
// expressions, while the outer node contributes its WHERE expressions only,
// because the outer node does not introduce NULLs.
List<AbstractExpression> newExprs = new ArrayList<AbstractExpression>(exprs);
if (leftNode.getJoinExpression() != null) {
newExprs.add(leftNode.getJoinExpression());
}
if (rightNode.getJoinExpression() != null) {
newExprs.add(rightNode.getJoinExpression());
}
if (leftNode.getWhereExpression() != null) {
exprs.add(leftNode.getWhereExpression());
}
if (rightNode.getWhereExpression() != null) {
exprs.add(rightNode.getWhereExpression());
}
if (joinNode.getJoinType() == JoinType.INNER) {
exprs.addAll(newExprs);
if (leftNode instanceof BranchNode) {
simplifyOuterJoinRecursively((BranchNode)leftNode, exprs);
}
if (rightNode instanceof BranchNode) {
simplifyOuterJoinRecursively((BranchNode)rightNode, exprs);
}
} else {
if (rightNode instanceof BranchNode) {
newExprs.addAll(exprs);
simplifyOuterJoinRecursively((BranchNode)rightNode, newExprs);
}
if (leftNode instanceof BranchNode) {
simplifyOuterJoinRecursively((BranchNode)leftNode, exprs);
}
}
}
}