// Nothing to convert when there is no join operator, or when the join is pinned
// as sort-merge (fixedAsSorted) and must not become a map-join.
if (joinOp == null || joinOp.getConf().isFixedAsSorted()) {
  return null;
}
currTask.setTaskTag(Task.COMMON_JOIN);
MapWork currWork = currTask.getWork().getMapWork();

// Scaffolding for the conditional task: one candidate work/task per possible big
// table, plus the task->aliases mapping the conditional resolver uses at runtime
// to pick a branch based on actual input sizes.
List<Serializable> listWorks = new ArrayList<Serializable>();
List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
HashMap<Task<? extends Serializable>, Set<String>> taskToAliases =
    new HashMap<Task<? extends Serializable>, Set<String>>();
HashMap<String, ArrayList<String>> pathToAliases = currWork.getPathToAliases();
Map<String, Operator<? extends OperatorDesc>> aliasToWork = currWork.getAliasToWork();

// get parseCtx for this Join Operator
ParseContext parseCtx = physicalContext.getParseContext();
QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp);

// start to generate multiple map join tasks
JoinDesc joinDesc = joinOp.getConf();
// aliasToSize is a per-dispatcher cache of alias -> known input size; lazily created.
if (aliasToSize == null) {
  aliasToSize = new HashMap<String, Long>();
}
try {
  long aliasTotalKnownInputSize =
      getTotalKnownInputSize(context, currWork, pathToAliases, aliasToSize);

  // Positions that may legally act as the big (streamed) table given the join
  // conditions; if none, a map-join is impossible for this join.
  Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc
      .getConds());
  if (bigTableCandidates.isEmpty()) {
    return null;
  }

  // if any of bigTableCandidates is from multi-sourced, bigTableCandidates should
  // only contain multi-sourced because multi-sourced cannot be hashed or direct readable
  bigTableCandidates = multiInsertBigTableCheck(joinOp, bigTableCandidates);

  Configuration conf = context.getConf();

  // If sizes of at least n-1 tables in a n-way join are known and their sum is below
  // the noconditionaltask threshold, convert directly to an unconditional map-join
  // (no backup common-join task is created in that case).
  boolean convertJoinMapJoin = HiveConf.getBoolVar(conf,
      HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK);
  int bigTablePosition = -1;
  if (convertJoinMapJoin) {
    // This is the threshold that the user has specified to fit in mapjoin
    long mapJoinSize = HiveConf.getLongVar(conf,
        HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);

    Long bigTableSize = null;
    Set<String> aliases = aliasToWork.keySet();
    for (int tablePosition : bigTableCandidates) {
      Operator<?> parent = joinOp.getParentOperators().get(tablePosition);
      Set<String> participants = GenMapRedUtils.findAliases(currWork, parent);
      // Sum of all *other* aliases (the would-be hash-side tables); negative means
      // at least one of them has an unknown size.
      long sumOfOthers = Utilities.sumOfExcept(aliasToSize, aliases, participants);
      if (sumOfOthers < 0 || sumOfOthers > mapJoinSize) {
        continue; // some small alias is not known or too big
      }
      if (bigTableSize == null && bigTablePosition >= 0 && tablePosition < bigTablePosition) {
        continue; // prefer right most alias
      }
      // Among eligible candidates, pick the one with the largest known size.
      long aliasSize = Utilities.sumOf(aliasToSize, participants);
      if (bigTableSize == null || bigTableSize < 0 || (aliasSize >= 0 && aliasSize >= bigTableSize)) {
        bigTablePosition = tablePosition;
        bigTableSize = aliasSize;
      }
    }
  }

  currWork.setOpParseCtxMap(parseCtx.getOpParseCtx());
  currWork.setJoinTree(joinTree);

  if (bigTablePosition >= 0) {
    // Unconditional conversion: create the map join task with the chosen big table
    // and splice it into the task tree in place of the common join task.
    MapRedTask newTask = convertTaskToMapJoinTask(currTask.getWork(), bigTablePosition);
    newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
    replaceTask(currTask, newTask, physicalContext);

    // Can this task be merged with the child task. This can happen if a big table is being
    // joined with multiple small tables on different keys
    if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)) {
      mergeMapJoinTaskIntoItsChildMapRedTask(newTask, conf);
    }
    return newTask;
  }

  long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf,
      HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
  for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
    // this table cannot be big table
    if (!bigTableCandidates.contains(pos)) {
      continue;
    }
    Operator<?> startOp = joinOp.getParentOperators().get(pos);
    Set<String> aliases = GenMapRedUtils.findAliases(currWork, startOp);
    long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
    // Check the size threshold BEFORE cloning the plan: Utilities.clonePlan
    // round-trips the work through serialization and is expensive, so skip it
    // for positions that cannot be converted anyway.
    if (cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
      continue;
    }

    // deep copy a new mapred work from xml
    // Once HIVE-4396 is in, it would be faster to use a cheaper method to clone the plan
    MapredWork newWork = Utilities.clonePlan(currTask.getWork());

    // create map join task and set big table as pos
    MapRedTask newTask = convertTaskToMapJoinTask(newWork, pos);

    // add into conditional task
    listWorks.add(newTask.getWork());
    listTasks.add(newTask);
    newTask.setTaskTag(Task.CONVERTED_MAPJOIN);

    // set up backup task: fall back to the common join if the map-join fails at runtime
    newTask.setBackupTask(currTask);
    newTask.setBackupChildrenTasks(currTask.getChildTasks());

    // put the mapping task to aliases
    taskToAliases.put(newTask, aliases);
  }
} catch (Exception e) {
  // Preserve the original failure as the cause instead of flattening it into the
  // message string (printStackTrace discarded the chain for callers; the chained
  // cause keeps the full stack trace available to whoever handles this).
  throw new SemanticException("Generate Map Join Task Error: " + e.getMessage(), e);
}

// insert current common join task to conditional task (the always-valid fallback branch)
listWorks.add(currTask.getWork());
listTasks.add(currTask);
// clear JoinTree and OP Parse Context
currWork.setOpParseCtxMap(null);
currWork.setJoinTree(null);

// create conditional task and insert conditional task into task tree
ConditionalWork cndWork = new ConditionalWork(listWorks);
ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
cndTsk.setListTasks(listTasks);