*
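* @param subQuery the subquery whose configuration, master plan and execution block are used for the calculation
* @return the number of shuffle outputs (partitions) determined for the given subquery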
*/
public static int calculateShuffleOutputNum(SubQuery subQuery, DataChannel channel) {
TajoConf conf = subQuery.context.getConf();
MasterPlan masterPlan = subQuery.getMasterPlan();
ExecutionBlock parent = masterPlan.getParent(subQuery.getBlock());
GroupbyNode grpNode = null;
if (parent != null) {
grpNode = PlannerUtil.findMostBottomNode(parent.getPlan(), NodeType.GROUP_BY);
}
// Is this subquery the first step of join?
if (parent != null && parent.getScanNodes().length == 2) {
List<ExecutionBlock> childs = masterPlan.getChilds(parent);
// for outer
ExecutionBlock outer = childs.get(0);
long outerVolume = getInputVolume(subQuery.masterPlan, subQuery.context, outer);
// for inner
ExecutionBlock inner = childs.get(1);
long innerVolume = getInputVolume(subQuery.masterPlan, subQuery.context, inner);
LOG.info(subQuery.getId() + ", Outer volume: " + Math.ceil((double) outerVolume / 1048576) + "MB, "
+ "Inner volume: " + Math.ceil((double) innerVolume / 1048576) + "MB");
long bigger = Math.max(outerVolume, innerVolume);
int mb = (int) Math.ceil((double) bigger / 1048576);
LOG.info(subQuery.getId() + ", Bigger Table's volume is approximately " + mb + " MB");
int taskNum = (int) Math.ceil((double) mb /
conf.getIntVar(ConfVars.DIST_QUERY_JOIN_PARTITION_VOLUME));
int totalMem = getClusterTotalMemory(subQuery);
LOG.info(subQuery.getId() + ", Total memory of cluster is " + totalMem + " MB");
int slots = Math.max(totalMem / conf.getIntVar(ConfVars.TASK_DEFAULT_MEMORY), 1);
// Determine the number of tasks, capped by the number of available slots.
taskNum = Math.min(taskNum, slots);
LOG.info(subQuery.getId() + ", The determined number of join partitions is " + taskNum);
// The shuffle output number computed for a join may differ depending on the order
// in which the execution blocks are processed. Thus, we compare the computed number
// with the shuffle output numbers of the outgoing DataChannels of both children.
// If the computed number is right, it matches the DataChannel output numbers.
int outerShuffleOutptNum = 0, innerShuffleOutputNum = 0;
for (DataChannel eachChannel : masterPlan.getOutgoingChannels(outer.getId())) {
outerShuffleOutptNum = Math.max(outerShuffleOutptNum, eachChannel.getShuffleOutputNum());
}
for (DataChannel eachChannel : masterPlan.getOutgoingChannels(inner.getId())) {
innerShuffleOutputNum = Math.max(innerShuffleOutputNum, eachChannel.getShuffleOutputNum());
}
if (outerShuffleOutptNum != innerShuffleOutputNum
&& taskNum != outerShuffleOutptNum
&& taskNum != innerShuffleOutputNum) {
taskNum = Math.max(outerShuffleOutptNum, innerShuffleOutputNum);
}
return taskNum;
// Is this subquery the first step of group-by?
} else if (grpNode != null) {
if (grpNode.getGroupingColumns().length == 0) {
return 1;
} else {
long volume = getInputVolume(subQuery.masterPlan, subQuery.context, subQuery.block);
int mb = (int) Math.ceil((double) volume / 1048576);
LOG.info(subQuery.getId() + ", Table's volume is approximately " + mb + " MB");
// Determine the number of tasks from the input size, capped by the available slots.
int taskNumBySize = (int) Math.ceil((double) mb /
conf.getIntVar(ConfVars.DIST_QUERY_GROUPBY_PARTITION_VOLUME));
int totalMem = getClusterTotalMemory(subQuery);
LOG.info(subQuery.getId() + ", Total memory of cluster is " + totalMem + " MB");
int slots = Math.max(totalMem / conf.getIntVar(ConfVars.TASK_DEFAULT_MEMORY), 1);
int taskNum = Math.min(taskNumBySize, slots); //Maximum partitions
LOG.info(subQuery.getId() + ", The determined number of aggregation partitions is " + taskNum);
return taskNum;
}
} else {