if (allSatisfyPreCondition) {
// statistics object that is combination of statistics from all
// relations involved in JOIN
Statistics stats = new Statistics();
long prodRows = 1;
List<Long> distinctVals = Lists.newArrayList();
boolean multiAttr = false;
Map<String, ColStatistics> joinedColStats = Maps.newHashMap();
Map<Integer, List<String>> joinKeys = Maps.newHashMap();
// get the join keys from parent ReduceSink operators
for (int pos = 0; pos < parents.size(); pos++) {
ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
Statistics parentStats = parent.getStatistics();
prodRows *= parentStats.getNumRows();
List<ExprNodeDesc> keyExprs = parent.getConf().getKeyCols();
// multi-attribute join key
if (keyExprs.size() > 1) {
multiAttr = true;
}
// compute fully qualified join key column names. these names will be
// used to quickly look up the column statistics of the join keys.
// TODO: expressions in the join condition will be ignored. assign
// internal names to expressions and estimate column statistics for them.
List<String> fqCols =
StatsUtils.getFullQualifedColNameFromExprs(keyExprs, parent.getColumnExprMap());
joinKeys.put(pos, fqCols);
Map<String, ExprNodeDesc> colExprMap = parent.getColumnExprMap();
RowSchema rs = parent.getSchema();
// get column statistics for all output columns
List<ColStatistics> cs =
StatsUtils.getColStatisticsFromExprMap(conf, parentStats, colExprMap, rs);
for (ColStatistics c : cs) {
if (c != null) {
joinedColStats.put(c.getFullyQualifiedColName(), c);
}
}
// since new statistics is derived from all relations involved in
// JOIN, we need to update the state information accordingly
stats.updateColumnStatsState(parentStats.getColumnStatsState());
}
// compute the denominator, i.e., max(V(R,Y), V(S,Y)) in case of a
// single-attribute join, else max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2))
// in case of a multi-attribute join
long denom = 1;
if (multiAttr) {
List<Long> perAttrDVs = Lists.newArrayList();
int numAttr = joinKeys.get(0).size();
for (int idx = 0; idx < numAttr; idx++) {
for (Integer i : joinKeys.keySet()) {
String col = joinKeys.get(i).get(idx);
ColStatistics cs = joinedColStats.get(col);
if (cs != null) {
perAttrDVs.add(cs.getCountDistint());
}
}
distinctVals.add(getDenominator(perAttrDVs));
perAttrDVs.clear();
}
for (Long l : distinctVals) {
denom *= l;
}
} else {
for (List<String> jkeys : joinKeys.values()) {
for (String jk : jkeys) {
ColStatistics cs = joinedColStats.get(jk);
if (cs != null) {
distinctVals.add(cs.getCountDistint());
}
}
}
denom = getDenominator(distinctVals);
}
// column statistics from the different sources are put together, and their
// column names and table aliases are rewritten based on the output schema
// of the join operator
Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
RowSchema rs = jop.getSchema();
List<ColStatistics> outColStats = Lists.newArrayList();
for (ColumnInfo ci : rs.getSignature()) {
String key = ci.getInternalName();
ExprNodeDesc end = colExprMap.get(key);
if (end instanceof ExprNodeColumnDesc) {
String colName = ((ExprNodeColumnDesc) end).getColumn();
colName = StatsUtils.stripPrefixFromColumnName(colName);
String tabAlias = ((ExprNodeColumnDesc) end).getTabAlias();
String fqColName = StatsUtils.getFullyQualifiedColumnName(tabAlias, colName);
ColStatistics cs = joinedColStats.get(fqColName);
String outColName = key;
String outTabAlias = ci.getTabAlias();
outColName = StatsUtils.stripPrefixFromColumnName(outColName);
if (cs != null) {
cs.setColumnName(outColName);
cs.setTableAlias(outTabAlias);
}
outColStats.add(cs);
}
}
// update join statistics
stats.setColumnStats(outColStats);
long newRowCount = prodRows / denom;
stats.setNumRows(newRowCount);
stats.setDataSize(StatsUtils.getDataSizeFromColumnStats(newRowCount, outColStats));
jop.setStatistics(stats);
if (LOG.isDebugEnabled()) {
LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString());
}
} else {
// worst case when there are no column statistics
float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR);
int numParents = parents.size();
List<Long> parentRows = Lists.newArrayList();
List<Long> parentSizes = Lists.newArrayList();
int maxRowIdx = 0;
long maxRowCount = 0;
int idx = 0;
for (Operator<? extends OperatorDesc> op : parents) {
Statistics ps = op.getStatistics();
long rowCount = ps.getNumRows();
if (rowCount > maxRowCount) {
maxRowCount = rowCount;
maxRowIdx = idx;
}
parentRows.add(rowCount);
parentSizes.add(ps.getDataSize());
idx++;
}
long maxDataSize = parentSizes.get(maxRowIdx);
long newNumRows = (long) (joinFactor * maxRowCount * (numParents - 1));
long newDataSize = (long) (joinFactor * maxDataSize * (numParents - 1));
Statistics wcStats = new Statistics();
wcStats.setNumRows(newNumRows);
wcStats.setDataSize(newDataSize);
jop.setStatistics(wcStats);
if (LOG.isDebugEnabled()) {
LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
}
}
}
return null;
}