// 3. Connect to metastore and get the stats
// 4. Compose rows and add them to FetchWork
// 5. Delete GBY - RS - GBY - SEL from the pipeline.
try {
TableScanOperator tsOp = (TableScanOperator) stack.get(0);
if(tsOp.getParentOperators() != null && tsOp.getParentOperators().size() > 0) {
// Looks like a subquery plan.
return null;
}
SelectOperator selOp = (SelectOperator)tsOp.getChildren().get(0);
for(ExprNodeDesc desc : selOp.getConf().getColList()) {
if (!(desc instanceof ExprNodeColumnDesc)) {
// Probably an expression; we can't handle that.
return null;
}
}
// Since we have already done an exact match on TS-SEL-GBY-RS-GBY-SEL-FS,
// we do not need any instanceof checks for the casts that follow.
GroupByOperator gbyOp = (GroupByOperator)selOp.getChildren().get(0);
ReduceSinkOperator rsOp = (ReduceSinkOperator)gbyOp.getChildren().get(0);
if (rsOp.getConf().getDistinctColumnIndices().size() > 0) {
// we can't handle distinct
return null;
}
selOp = (SelectOperator)rsOp.getChildOperators().get(0).getChildOperators().get(0);
List<AggregationDesc> aggrs = gbyOp.getConf().getAggregators();
if (!(selOp.getConf().getColList().size() == aggrs.size())) {
// all select columns must be aggregations
return null;
}
FileSinkOperator fsOp = (FileSinkOperator)(selOp.getChildren().get(0));
if (fsOp.getChildOperators() != null && fsOp.getChildOperators().size() > 0) {
// Looks like a subquery plan.
return null;
}
Table tbl = pctx.getTopToTable().get(tsOp);
List<Object> oneRow = new ArrayList<Object>();
List<ObjectInspector> ois = new ArrayList<ObjectInspector>();
Hive hive = Hive.get(pctx.getConf());
for (AggregationDesc aggr : aggrs) {
if (aggr.getDistinct()) {
// Our stats for NDV are approximate, not exact.
return null;
}
if (aggr.getGenericUDAFName().equals(GenericUDAFSum.class.getAnnotation(
Description.class).name())) {
if(!(aggr.getParameters().get(0) instanceof ExprNodeConstantDesc)){
return null;
}
Long rowCnt = getRowCnt(pctx, tsOp, tbl);
if(rowCnt == null) {
return null;
}
oneRow.add(HiveDecimal.create(((ExprNodeConstantDesc) aggr.getParameters().get(0))
.getValue().toString()).multiply(HiveDecimal.create(rowCnt)));
ois.add(PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(
PrimitiveCategory.DECIMAL));
}
else if (aggr.getGenericUDAFName().equals(GenericUDAFCount.class.getAnnotation(
Description.class).name())) {
Long rowCnt = 0L;
if ((aggr.getParameters().isEmpty() || aggr.getParameters().get(0) instanceof
ExprNodeConstantDesc)) {
// It's either the count(*) or the count(1) case
rowCnt = getRowCnt(pctx, tsOp, tbl);
if(rowCnt == null) {
return null;
}
} else {
// It's the count(col) case
if (!(aggr.getParameters().get(0) instanceof ExprNodeColumnDesc)) {
// Unexpected: the argument is not a plain column reference (an expression or similar), so bail out.
Log.debug("Unexpected expression : " + aggr.getParameters().get(0));
return null;
}
ExprNodeColumnDesc desc = (ExprNodeColumnDesc)aggr.getParameters().get(0);
String colName = desc.getColumn();
StatType type = getType(desc.getTypeString());
if(!tbl.isPartitioned()) {
if (!StatsSetupConst.areStatsUptoDate(tbl.getParameters())) {
Log.debug("Stats for table : " + tbl.getTableName() + " are not upto date.");
return null;
}
rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT));
if (rowCnt < 1) {
Log.debug("Table doesn't have upto date stats " + tbl.getTableName());
return null;
}
List<ColumnStatisticsObj> stats = hive.getMSC().getTableColumnStatistics(
tbl.getDbName(),tbl.getTableName(), Lists.newArrayList(colName));
if (stats.isEmpty()) {
Log.debug("No stats for " + tbl.getTableName() + " column " + colName);
return null;
}
Long nullCnt = getNullcountFor(type, stats.get(0).getStatsData());
if (null == nullCnt) {
Log.debug("Unsupported type: " + desc.getTypeString() + " encountered in " +
"metadata optimizer for column : " + colName);
return null;
} else {
rowCnt -= nullCnt;
}
} else {
Set<Partition> parts = pctx.getPrunedPartitions(
tsOp.getConf().getAlias(), tsOp).getPartitions();
for (Partition part : parts) {
if (!StatsSetupConst.areStatsUptoDate(part.getParameters())) {
Log.debug("Stats for part : " + part.getSpec() + " are not upto date.");
return null;
}
Long partRowCnt = Long.parseLong(part.getParameters()
.get(StatsSetupConst.ROW_COUNT));
if (partRowCnt < 1) {
Log.debug("Partition doesn't have upto date stats " + part.getSpec());
return null;
}
rowCnt += partRowCnt;
}
Collection<List<ColumnStatisticsObj>> result =
verifyAndGetPartStats(hive, tbl, colName, parts);
if (result == null) {
return null; // logging inside
}
for (List<ColumnStatisticsObj> statObj : result) {
ColumnStatisticsData statData = validateSingleColStat(statObj);
if (statData == null) return null;
Long nullCnt = getNullcountFor(type, statData);
if (nullCnt == null) {
Log.debug("Unsupported type: " + desc.getTypeString() + " encountered in " +
"metadata optimizer for column : " + colName);
return null;
} else {
rowCnt -= nullCnt;
}
}
}
}
oneRow.add(rowCnt);
ois.add(PrimitiveObjectInspectorFactory.
getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
} else if (aggr.getGenericUDAFName().equals(GenericUDAFMax.class.getAnnotation(
Description.class).name())) {
ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc)aggr.getParameters().get(0);
String colName = colDesc.getColumn();
StatType type = getType(colDesc.getTypeString());
if(!tbl.isPartitioned()) {
if (!StatsSetupConst.areStatsUptoDate(tbl.getParameters())) {
Log.debug("Stats for table : " + tbl.getTableName() + " are not upto date.");
return null;
}
List<ColumnStatisticsObj> stats = hive.getMSC().getTableColumnStatistics(
tbl.getDbName(),tbl.getTableName(), Lists.newArrayList(colName));
if (stats.isEmpty()) {
Log.debug("No stats for " + tbl.getTableName() + " column " + colName);
return null;
}
ColumnStatisticsData statData = stats.get(0).getStatsData();
switch (type) {
case Integeral:
oneRow.add(statData.getLongStats().getHighValue());
ois.add(PrimitiveObjectInspectorFactory.
getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
break;
case Double:
oneRow.add(statData.getDoubleStats().getHighValue());
ois.add(PrimitiveObjectInspectorFactory.
getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
break;
default:
// unsupported type
Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
"metadata optimizer for column : " + colName);
return null;
}
} else {
Set<Partition> parts = pctx.getPrunedPartitions(
tsOp.getConf().getAlias(), tsOp).getPartitions();
switch (type) {
case Integeral: {
long maxVal = Long.MIN_VALUE;
Collection<List<ColumnStatisticsObj>> result =
verifyAndGetPartStats(hive, tbl, colName, parts);
if (result == null) {
return null; // logging inside
}
for (List<ColumnStatisticsObj> statObj : result) {
ColumnStatisticsData statData = validateSingleColStat(statObj);
if (statData == null) return null;
long curVal = statData.getLongStats().getHighValue();
maxVal = Math.max(maxVal, curVal);
}
oneRow.add(maxVal);
ois.add(PrimitiveObjectInspectorFactory.
getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
break;
}
case Double: {
double maxVal = Double.MIN_VALUE;
Collection<List<ColumnStatisticsObj>> result =
verifyAndGetPartStats(hive, tbl, colName, parts);
if (result == null) {
return null; // logging inside
}
for (List<ColumnStatisticsObj> statObj : result) {
ColumnStatisticsData statData = validateSingleColStat(statObj);
if (statData == null) return null;
double curVal = statData.getDoubleStats().getHighValue();
maxVal = Math.max(maxVal, curVal);
}
oneRow.add(maxVal);
ois.add(PrimitiveObjectInspectorFactory.
getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
break;
}
default:
Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
"metadata optimizer for column : " + colName);
return null;
}
}
} else if (aggr.getGenericUDAFName().equals(GenericUDAFMin.class.getAnnotation(
Description.class).name())) {
ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc)aggr.getParameters().get(0);
String colName = colDesc.getColumn();
StatType type = getType(colDesc.getTypeString());
if (!tbl.isPartitioned()) {
if (!StatsSetupConst.areStatsUptoDate(tbl.getParameters())) {
Log.debug("Stats for table : " + tbl.getTableName() + " are not upto date.");
return null;
}
ColumnStatisticsData statData = hive.getMSC().getTableColumnStatistics(
tbl.getDbName(), tbl.getTableName(), Lists.newArrayList(colName))
.get(0).getStatsData();
switch (type) {
case Integeral:
oneRow.add(statData.getLongStats().getLowValue());
ois.add(PrimitiveObjectInspectorFactory.
getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
break;
case Double:
oneRow.add(statData.getDoubleStats().getLowValue());
ois.add(PrimitiveObjectInspectorFactory.
getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
break;
default: // unsupported type
Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
"metadata optimizer for column : " + colName);
return null;
}
} else {
Set<Partition> parts = pctx.getPrunedPartitions(tsOp.getConf().getAlias(), tsOp).getPartitions();
switch(type) {
case Integeral: {
long minVal = Long.MAX_VALUE;
Collection<List<ColumnStatisticsObj>> result =
verifyAndGetPartStats(hive, tbl, colName, parts);