if (!(childTask instanceof MapRedTask)) {
// Nothing to do if it is not a MapReduce task.
return;
}
MapRedTask childMapRedTask = (MapRedTask) childTask;
MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
MapWork childMapWork = childMapRedTask.getWork().getMapWork();
Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork =
mapJoinMapWork.getAliasToWork();
if (mapJoinAliasToWork.size() > 1) {
// Do not merge if the MapredWork of MapJoin has multiple input aliases.
return;
}
Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry =
mapJoinAliasToWork.entrySet().iterator().next();
String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
TableScanOperator mapJoinTaskTableScanOperator =
OperatorUtils.findSingleOperator(
mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
if (mapJoinTaskTableScanOperator == null) {
throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
" operator as the work associated with alias " + mapJoinAlias +
". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
}
FileSinkOperator mapJoinTaskFileSinkOperator =
OperatorUtils.findSingleOperator(
mapJoinTaskTableScanOperator, FileSinkOperator.class);
if (mapJoinTaskFileSinkOperator == null) {
throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() +
" operator at the last operator of the MapJoin Task.");
}
// The mapJoinTaskFileSinkOperator writes to a different directory
String childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName();
List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
if (childMRAliases == null || childMRAliases.size() != 1) {
return;
}
String childMRAlias = childMRAliases.get(0);
MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapLocalWork();
MapredLocalWork childLocalWork = childMapWork.getMapLocalWork();
if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) ||
(childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
// Right now, we do not handle the case that either of them is bucketed.
// We should relax this constraint with a follow-up jira.
return;
}
// We need to check if the total size of local tables is under the limit.
// At here, we are using a strong condition, which is the total size of
// local tables used by all input paths. Actually, we can relax this condition
// to check the total size of local tables for every input path.
// Example:
// UNION_ALL
// / \
// / \
// / \
// / \
// MapJoin1 MapJoin2
// / | \ / | \
// / | \ / | \
// Big1 S1 S2 Big2 S3 S4
// In this case, we have two MapJoins, MapJoin1 and MapJoin2. Big1 and Big2 are two
// big tables, and S1, S2, S3, and S4 are four small tables. Hash tables of S1 and S2
// will only be used by Map tasks processing Big1. Hash tables of S3 and S4 will only
// be used by Map tasks processing Big2. If Big1!=Big2, we should only check if the size
// of S1 + S2 is under the limit, and if the size of S3 + S4 is under the limit.
// But, right now, we are checking the size of S1 + S2 + S3 + S4 is under the limit.
// If Big1=Big2, we will only scan a path once. So, MapJoin1 and MapJoin2 will be executed
// in the same Map task. In this case, we need to make sure the size of S1 + S2 + S3 + S4
// is under the limit.
if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)){
// The total size of local tables may not be under
// the limit after we merge mapJoinLocalWork and childLocalWork.
// Do not merge.
return;
}
TableScanOperator childMRTaskTableScanOperator =
OperatorUtils.findSingleOperator(
childMapWork.getAliasToWork().get(childMRAlias), TableScanOperator.class);
if (childMRTaskTableScanOperator == null) {
throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
" operator as the work associated with alias " + childMRAlias +
". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
}
List<Operator<? extends OperatorDesc>> parentsInMapJoinTask =
mapJoinTaskFileSinkOperator.getParentOperators();
List<Operator<? extends OperatorDesc>> childrenInChildMRTask =
childMRTaskTableScanOperator.getChildOperators();
if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
// Do not merge if we do not know how to connect two operator trees.
return;
}
// Step 2: Merge mapJoinTask into the Map-side of its child.
// Step 2.1: Connect the operator trees of two MapRedTasks.
Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);
// Step 2.2: Replace the corresponding part childMRWork's MapWork.
GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias, mapJoinMapWork, childMapWork);
// Step 2.3: Fill up stuff in local work
if (mapJoinLocalWork != null) {
if (childLocalWork == null) {
childMapWork.setMapLocalWork(mapJoinLocalWork);
} else {
childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
}
}
// Step 2.4: Remove this MapJoin task
List<Task<? extends Serializable>> parentTasks = mapJoinTask.getParentTasks();
mapJoinTask.setParentTasks(null);
mapJoinTask.setChildTasks(null);
childMapRedTask.getParentTasks().remove(mapJoinTask);
if (parentTasks != null) {
childMapRedTask.getParentTasks().addAll(parentTasks);
for (Task<? extends Serializable> parentTask : parentTasks) {
parentTask.getChildTasks().remove(mapJoinTask);
if (!parentTask.getChildTasks().contains(childMapRedTask)) {
parentTask.getChildTasks().add(childMapRedTask);
}
}
} else {
if (physicalContext.getRootTasks().contains(mapJoinTask)) {
physicalContext.removeFromRootTask(mapJoinTask);
if (childMapRedTask.getParentTasks() != null &&
childMapRedTask.getParentTasks().size() == 0 &&
!physicalContext.getRootTasks().contains(childMapRedTask)) {
physicalContext.addToRootTask(childMapRedTask);
}
}
}
if (childMapRedTask.getParentTasks().size() == 0) {
childMapRedTask.setParentTasks(null);
}
}