@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
GroupByOperator gop = (GroupByOperator) nd;
Operator<? extends OperatorDesc> parent = gop.getParentOperators().get(0);
Statistics parentStats = parent.getStatistics();
// parent stats are not populated yet
if (parentStats == null) {
return null;
}
AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
HiveConf conf = aspCtx.getConf();
long maxSplitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE);
List<AggregationDesc> aggDesc = gop.getConf().getAggregators();
Map<String, ExprNodeDesc> colExprMap = gop.getColumnExprMap();
RowSchema rs = gop.getSchema();
Statistics stats = null;
List<ColStatistics> colStats = StatsUtils.getColStatisticsFromExprMap(conf, parentStats,
colExprMap, rs);
long cardinality;
long parallelism = 1L;
boolean mapSide = false;
boolean mapSideHashAgg = false;
long inputSize = 1L;
boolean containsGroupingSet = gop.getConf().isGroupingSetsPresent();
long sizeOfGroupingSet =
containsGroupingSet ? gop.getConf().getListGroupingSets().size() : 1L;
// There are different cases for Group By depending on map/reduce side, hash aggregation,
// grouping sets and column stats. If we don't have column stats, we just assume hash
// aggregation is disabled. Following are the possible cases and rule for cardinality
// estimation
// MAP SIDE:
// Case 1: NO column stats, NO hash aggregation, NO grouping sets — numRows
// Case 2: NO column stats, NO hash aggregation, grouping sets — numRows * sizeOfGroupingSet
// Case 3: column stats, hash aggregation, NO grouping sets — Min(numRows / 2, ndvProduct * parallelism)
// Case 4: column stats, hash aggregation, grouping sets — Min((numRows * sizeOfGroupingSet) / 2, ndvProduct * parallelism * sizeOfGroupingSet)
// Case 5: column stats, NO hash aggregation, NO grouping sets — numRows
// Case 6: column stats, NO hash aggregation, grouping sets — numRows * sizeOfGroupingSet
// REDUCE SIDE:
// Case 7: NO column stats — numRows / 2
// Case 8: column stats, grouping sets — Min(numRows, ndvProduct * sizeOfGroupingSet)
// Case 9: column stats, NO grouping sets — Min(numRows, ndvProduct)
if (gop.getChildOperators().get(0) instanceof ReduceSinkOperator ||
gop.getChildOperators().get(0) instanceof AppMasterEventOperator) {
mapSide = true;
// consider approximate map side parallelism to be table data size
// divided by max split size
TableScanOperator top = OperatorUtils.findSingleOperatorUpstream(gop,
TableScanOperator.class);
// if top is null then there are multiple parents (RS as well), hence
// let's use parent statistics to get data size. Also maxSplitSize should
// be updated to bytes per reducer (1GB default)
if (top == null) {
inputSize = parentStats.getDataSize();
maxSplitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.BYTESPERREDUCER);
} else {
inputSize = top.getConf().getStatistics().getDataSize();
}
parallelism = (int) Math.ceil((double) inputSize / maxSplitSize);
}
if (isDebugEnabled) {
LOG.debug("STATS-" + gop.toString() + ": inputSize: " + inputSize + " maxSplitSize: " +
maxSplitSize + " parallelism: " + parallelism + " containsGroupingSet: " +
containsGroupingSet + " sizeOfGroupingSet: " + sizeOfGroupingSet);
}
try {
// satisfying the precondition means column statistics are available
if (satisfyPrecondition(parentStats)) {
// check if map side aggregation is possible or not based on column stats
mapSideHashAgg = checkMapSideAggregation(gop, colStats, conf);
if (isDebugEnabled) {
LOG.debug("STATS-" + gop.toString() + " mapSideHashAgg: " + mapSideHashAgg);
}
stats = parentStats.clone();
stats.setColumnStats(colStats);
long ndvProduct = 1;
final long parentNumRows = stats.getNumRows();
// compute product of distinct values of grouping columns
for (ColStatistics cs : colStats) {
if (cs != null) {
long ndv = cs.getCountDistint();
if (cs.getNumNulls() > 0) {
ndv = StatsUtils.safeAdd(ndv, 1);
}
ndvProduct = StatsUtils.safeMult(ndvProduct, ndv);
} else {
if (parentStats.getColumnStatsState().equals(Statistics.State.COMPLETE)) {
// the column must be an aggregate column inserted by GBY. We
// don't have to account for this column when computing product
// of NDVs
continue;
} else {
// partial column statistics on grouping attributes case.
// if column statistics on grouping attribute is missing, then
// assume worst case.
// GBY rule will emit half the number of rows if ndvProduct is 0
ndvProduct = 0;
}
break;
}
}
// if ndvProduct is 0 then column stats state must be partial and we are missing
// column stats for a group by column
if (ndvProduct == 0) {
ndvProduct = parentNumRows / 2;
if (isDebugEnabled) {
LOG.debug("STATS-" + gop.toString() + ": ndvProduct became 0 as some column does not" +
" have stats. ndvProduct changed to: " + ndvProduct);
}
}
if (mapSide) {
// MAP SIDE
if (mapSideHashAgg) {
if (containsGroupingSet) {
// Case 4: column stats, hash aggregation, grouping sets
cardinality = Math.min(
(StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet)) / 2,
StatsUtils.safeMult(StatsUtils.safeMult(ndvProduct, parallelism), sizeOfGroupingSet));
if (isDebugEnabled) {
LOG.debug("[Case 4] STATS-" + gop.toString() + ": cardinality: " + cardinality);
}
} else {
// Case 3: column stats, hash aggregation, NO grouping sets
cardinality = Math.min(parentNumRows / 2, StatsUtils.safeMult(ndvProduct, parallelism));
if (isDebugEnabled) {
LOG.debug("[Case 3] STATS-" + gop.toString() + ": cardinality: " + cardinality);
}
}
} else {
if (containsGroupingSet) {
// Case 6: column stats, NO hash aggregation, grouping sets
cardinality = StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet);
if (isDebugEnabled) {
LOG.debug("[Case 6] STATS-" + gop.toString() + ": cardinality: " + cardinality);
}
} else {
// Case 5: column stats, NO hash aggregation, NO grouping sets
cardinality = parentNumRows;
if (isDebugEnabled) {
LOG.debug("[Case 5] STATS-" + gop.toString() + ": cardinality: " + cardinality);
}
}
}
} else {
// REDUCE SIDE
// In the reduce side GBY, we don't know if a grouping set was present or not, so get it
// from the map side GBY
GroupByOperator mGop = OperatorUtils.findSingleOperatorUpstream(parent, GroupByOperator.class);
if (mGop != null) {
containsGroupingSet = mGop.getConf().isGroupingSetsPresent();
sizeOfGroupingSet = mGop.getConf().getListGroupingSets().size();
}
if (containsGroupingSet) {
// Case 8: column stats, grouping sets
cardinality = Math.min(parentNumRows, StatsUtils.safeMult(ndvProduct, sizeOfGroupingSet));