/**
 * Inserts a ReduceSink (RS) and an Extract (EX) operator between a dynamic-partition
 * FileSink (FS) and its current parent, so that the FS becomes the child of the new EX.
 *
 * <p>Nothing is inserted, and {@code null} is returned, when any of the following holds:
 * <ul>
 *   <li>the FS descriptor has no dynamic partitioning context;</li>
 *   <li>list bucketing is in effect (non-empty skewed column names and values);</li>
 *   <li>no destination table is registered for the FS in the parse context;</li>
 *   <li>{@code removeRSInsertedByEnforceBucketing} returns {@code false}.</li>
 * </ul>
 *
 * <p>Otherwise the FS is unlinked from its first parent, an RS and an EX are created under that
 * parent, the FS is re-attached below the EX, and the FS descriptor is updated (single file,
 * DP sort state, partition columns, number of buckets on the dynamic partitioning context).
 *
 * @param nd the node being visited; cast to {@link FileSinkOperator} without a type check
 * @param stack not used by this method
 * @param procCtx not used by this method
 * @param nodeOutputs not used by this method
 * @return always {@code null}
 * @throws SemanticException declared by the signature; presumably raised by one of the helper
 *         calls (not visible here, confirm against the helpers)
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
    Object... nodeOutputs) throws SemanticException {
  // introduce RS and EX before FS. If the operator tree already contains
  // RS then ReduceSinkDeDuplication optimization should merge them
  FileSinkOperator fsOp = (FileSinkOperator) nd;

  LOG.info("Sorted dynamic partitioning optimization kicked in..");

  // if not dynamic partitioning then bail out
  if (fsOp.getConf().getDynPartCtx() == null) {
    LOG.debug("Bailing out of sort dynamic partition optimization as dynamic partitioning context is null");
    return null;
  }

  // if list bucketing then bail out
  ListBucketingCtx lbCtx = fsOp.getConf().getLbCtx();
  if (lbCtx != null && !lbCtx.getSkewedColNames().isEmpty()
      && !lbCtx.getSkewedColValues().isEmpty()) {
    LOG.debug("Bailing out of sort dynamic partition optimization as list bucketing is enabled");
    return null;
  }

  Table destTable = parseCtx.getFsopToTable().get(fsOp);
  if (destTable == null) {
    LOG.debug("Bailing out of sort dynamic partition optimization as destination table is null");
    return null;
  }

  // if RS is inserted by enforce bucketing or sorting, we need to remove it
  // since ReduceSinkDeDuplication will not merge them to single RS.
  // RS inserted by enforce bucketing/sorting will have bucketing column in
  // reduce sink key whereas RS inserted by this optimization will have
  // partition columns followed by bucket number followed by sort columns in
  // the reduce sink key. Since both key columns are not prefix subset
  // ReduceSinkDeDuplication will not merge them together resulting in 2 MR jobs.
  // To avoid that we will remove the RS (and EX) inserted by enforce bucketing/sorting.
  if (!removeRSInsertedByEnforceBucketing(fsOp)) {
    LOG.debug("Bailing out of sort dynamic partition optimization as some partition columns " +
        "got constant folded.");
    return null;
  }

  // unlink connection between FS and its parent
  Operator<? extends OperatorDesc> fsParent = fsOp.getParentOperators().get(0);
  fsParent.getChildOperators().clear();

  // Read the descriptor only after the helper above has run, exactly where the original
  // first re-read it, so the rest of the method works on one consistent reference.
  FileSinkDesc fsConf = fsOp.getConf();
  DynamicPartitionCtx dpCtx = fsConf.getDynPartCtx();
  AcidUtils.Operation writeType = fsConf.getWriteType();
  int numBuckets = destTable.getNumBuckets();

  // if enforce bucketing/sorting is disabled numBuckets will not be set.
  // set the number of buckets here to ensure creation of empty buckets
  dpCtx.setNumBuckets(numBuckets);

  // Get the positions for partition, bucket and sort columns
  List<Integer> bucketPositions = getBucketPositions(destTable.getBucketCols(),
      destTable.getCols());
  ObjectPair<List<Integer>, List<Integer>> sortOrderPositions = getSortPositionsOrder(
      destTable.getSortCols(), destTable.getCols());
  List<Integer> sortPositions;
  List<Integer> sortOrder;
  if (writeType == AcidUtils.Operation.UPDATE || writeType == AcidUtils.Operation.DELETE) {
    // When doing updates and deletes we always want to sort on the rowid because the ACID
    // reader will expect this sort order when doing reads. So
    // ignore whatever comes from the table and enforce this sort order instead.
    sortPositions = Arrays.asList(0);
    // 1 means ascending; an enum would be clearer than a bare int here.
    sortOrder = Arrays.asList(1);
  } else {
    sortPositions = sortOrderPositions.getFirst();
    sortOrder = sortOrderPositions.getSecond();
  }

  // Guarded so the per-element boxing and string concatenation are skipped unless debug
  // logging is actually enabled.
  if (LOG.isDebugEnabled()) {
    LOG.debug("Got sort order");
    for (int i : sortPositions) {
      LOG.debug("sort position " + i);
    }
    for (int i : sortOrder) {
      LOG.debug("sort order " + i);
    }
  }

  List<Integer> partitionPositions = getPartitionPositions(dpCtx, fsParent.getSchema());

  // Row resolver of the FS parent: source of the bucket column expressions and the template
  // for the RS output row resolver. Looked up once instead of twice.
  RowResolver inputRR = parseCtx.getOpParseCtx().get(fsParent).getRowResolver();
  List<ColumnInfo> colInfos = inputRR.getColumnInfos();
  ArrayList<ExprNodeDesc> bucketColumns = getPositionsToExprNodes(bucketPositions, colInfos);

  // update file sink descriptor
  fsConf.setMultiFileSpray(false);
  fsConf.setNumFiles(1);
  fsConf.setTotalFiles(1);

  // Create ReduceSinkDesc
  RowResolver outRR = copyRowResolver(inputRR).getSecond();
  ArrayList<ColumnInfo> valColInfo = Lists.newArrayList(fsParent.getSchema().getSignature());
  ArrayList<ExprNodeDesc> newValueCols = Lists.newArrayList();
  Map<String, ExprNodeDesc> colExprMap = Maps.newHashMap();
  for (ColumnInfo ci : valColInfo) {
    // Every parent column is forwarded as an RS value column, keyed by its internal name.
    ExprNodeDesc valueCol = new ExprNodeColumnDesc(ci);
    newValueCols.add(valueCol);
    colExprMap.put(ci.getInternalName(), valueCol);
  }
  ReduceSinkDesc rsConf = getReduceSinkDesc(partitionPositions, sortPositions, sortOrder,
      newValueCols, bucketColumns, numBuckets, fsParent, writeType);

  if (!bucketColumns.isEmpty()) {
    // Register the bucket number column in the RS output row resolver, under the table
    // alias of the first existing column.
    String tableAlias = outRR.getColumnInfos().get(0).getTabAlias();
    ColumnInfo ci = new ColumnInfo(BUCKET_NUMBER_COL_NAME, TypeInfoFactory.stringTypeInfo,
        tableAlias, true, true);
    outRR.put(tableAlias, BUCKET_NUMBER_COL_NAME, ci);
  }

  // Create ReduceSink operator
  ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(
      OperatorFactory.getAndMakeChild(rsConf, new RowSchema(outRR.getColumnInfos()), fsParent),
      outRR, parseCtx);
  rsOp.setColumnExprMap(colExprMap);

  // Create ExtractDesc
  RowResolver exRR = copyRowResolver(outRR).getSecond();
  ExtractDesc exConf = new ExtractDesc(new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
      Utilities.ReduceField.VALUE.toString(), "", false));

  // Create Extract Operator
  ExtractOperator exOp = (ExtractOperator) putOpInsertMap(
      OperatorFactory.getAndMakeChild(exConf, new RowSchema(exRR.getColumnInfos()), rsOp),
      exRR, parseCtx);

  // link EX to FS
  fsOp.getParentOperators().clear();
  fsOp.getParentOperators().add(exOp);
  exOp.getChildOperators().add(fsOp);

  // Set if partition sorted or partition bucket sorted
  fsConf.setDpSortState(bucketColumns.isEmpty()
      ? FileSinkDesc.DPSortState.PARTITION_SORTED
      : FileSinkDesc.DPSortState.PARTITION_BUCKET_SORTED);

  // update partition column info in FS descriptor
  ArrayList<ExprNodeDesc> partitionColumns = getPositionsToExprNodes(partitionPositions, rsOp
      .getSchema().getSignature());
  fsConf.setPartitionCols(partitionColumns);

  LOG.info("Inserted " + rsOp.getOperatorId() + " and " + exOp.getOperatorId()
      + " as parent of " + fsOp.getOperatorId() + " and child of " + fsParent.getOperatorId());
  return null;
}