/**
 * Statistics rule for GROUP BY operators.
 *
 * Derives the operator's {@code Statistics} from the statistics of its first parent
 * operator. The row-count estimate depends on whether the group-by is followed directly by
 * a {@code ReduceSinkOperator} (treated below as the map-side case) or not (reduce-side case),
 * and on whether column statistics for the grouping columns are available.
 *
 * NOTE(review): in this view of the file several closing braces (and possibly some
 * statements) are not visible, so the exact nesting of the branches below should be
 * confirmed against the complete source.
 */
public static class GroupByStatsRule extends DefaultStatsRule implements NodeProcessor {
/**
 * Estimates statistics for a group-by operator.
 *
 * @param nd the node being processed; cast to {@code GroupByOperator}
 * @param stack not read by the visible code
 * @param procCtx cast to {@code AnnotateStatsProcCtx}; supplies the {@code HiveConf}
 * @param nodeOutputs not read by the visible code
 * @return {@code null} in the visible code path
 * @throws SemanticException if cloning the parent statistics fails
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
GroupByOperator gop = (GroupByOperator) nd;
// only the first parent's statistics are consulted
Operator<? extends OperatorDesc> parent = gop.getParentOperators().get(0);
Statistics parentStats = parent.getStatistics();
AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
HiveConf conf = aspCtx.getConf();
// configured map-side parallelism; used as a row-count multiplier in the map-side branches
int mapSideParallelism =
HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAP_SIDE_PARALLELISM);
List<AggregationDesc> aggDesc = gop.getConf().getAggregators();
Map<String, ExprNodeDesc> colExprMap = gop.getColumnExprMap();
RowSchema rs = gop.getSchema();
// stays null when no estimate is produced (e.g. parentStats is null in the fallback branch)
Statistics stats = null;
try {
if (satisfyPrecondition(parentStats)) {
// work on a copy so the parent's statistics object is not modified through 'stats'
stats = parentStats.clone();
List<ColStatistics> colStats =
StatsUtils.getColStatisticsFromExprMap(conf, parentStats, colExprMap, rs);
long dvProd = 1;
long newNumRows = 0;
// compute product of distinct values of grouping columns
// NOTE(review): dvProd is a plain long product; with several high-cardinality columns
// this could overflow -- confirm whether applyGBYRule tolerates that.
for (ColStatistics cs : colStats) {
if (cs != null) {
long dv = cs.getCountDistint();
// a column containing NULLs contributes one extra distinct grouping value
if (cs.getNumNulls() > 0) {
dv += 1;
dvProd *= dv;
} else {
// partial column statistics on grouping attributes case.
// if column statistics on a grouping attribute are missing, then
// assume the worst case.
// GBY rule will emit half the number of rows if dvProd is 0
dvProd = 0;
// map side
if (gop.getChildOperators().get(0) instanceof ReduceSinkOperator) {
// since we do not know if hash-aggregation will be enabled or disabled
// at runtime we will assume that map-side group by does not do any
// reduction; hence no group by rule will be applied
// map-side grouping set present. if grouping set is present then
// multiply the number of rows by number of elements in grouping set
if (gop.getConf().isGroupingSetsPresent()) {
int multiplier = gop.getConf().getListGroupingSets().size();
// take into account the map-side parallelism as well, default is 1
multiplier *= mapSideParallelism;
newNumRows = multiplier * stats.getNumRows();
long dataSize = multiplier * stats.getDataSize();
// NOTE(review): the statements that apply newNumRows/dataSize to 'stats' are not
// visible in this view -- confirm against the complete source.
for (ColStatistics cs : colStats) {
if (cs != null) {
// scale the null count by the same multiplier as the row count
long oldNumNulls = cs.getNumNulls();
long newNumNulls = multiplier * oldNumNulls;
// NOTE(review): no visible statement writes newNumNulls back to 'cs' -- confirm.
} else {
// map side no grouping set
newNumRows = stats.getNumRows() * mapSideParallelism;
updateStats(stats, newNumRows, true);
} else {
// reduce side
newNumRows = applyGBYRule(stats.getNumRows(), dvProd);
updateStats(stats, newNumRows, true);
} else {
// precondition not satisfied: fall back to a coarse estimate based on parent stats only
if (parentStats != null) {
// worst case, in the absence of column statistics assume half the rows are emitted
if (gop.getChildOperators().get(0) instanceof ReduceSinkOperator) {
// map side
stats = parentStats.clone();
} else {
// reduce side
stats = parentStats.clone();
long newNumRows = parentStats.getNumRows() / 2;
updateStats(stats, newNumRows, false);
// if UDAFs are present, new columns need to be added
if (!aggDesc.isEmpty() && stats != null) {
List<ColStatistics> aggColStats = Lists.newArrayList();
for (ColumnInfo ci : rs.getSignature()) {
// if a column in the row schema is not contained in the column
// expression map, then it is one of the aggregate columns that
// are added by the GBY operator. we will estimate the column statistics
// for those newly added columns
if (!colExprMap.containsKey(ci.getInternalName())) {
String colName = ci.getInternalName();
colName = StatsUtils.stripPrefixFromColumnName(colName);
String tabAlias = ci.getTabAlias();
String colType = ci.getTypeName();
ColStatistics cs = new ColStatistics(tabAlias, colName, colType);
// NOTE(review): the code that populates 'cs' and adds it to aggColStats is not
// visible in this view -- confirm against the complete source.
// if UDAF present and if column expression map is empty then it must
// be full aggregation query like count(*) in which case number of
// rows will be 1
if (colExprMap.isEmpty()) {
updateStats(stats, 1, true);
if (LOG.isDebugEnabled() && stats != null) {
LOG.debug("[0] STATS-" + gop.toString() + ": " + stats.extendedToString());
} catch (CloneNotSupportedException e) {
// NOTE(review): the caught exception 'e' is not passed on as the cause, so the
// original stack trace is lost when the SemanticException is thrown.
throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg());
return null;