// Right now the work graph is pretty simple. If there is no
// Preceding work we have a root and will generate a map
// vertex. If there is a preceding work we will generate
// a reduce vertex
BaseWork work;
if (context.rootToWorkMap.containsKey(root)) {
// having seen the root operator before means there was a branch in the
// operator graph. There's typically two reasons for that: a) mux/demux
// b) multi insert. Mux/Demux will hit the same leaf again, multi insert
// will result into a vertex with multiple FS or RS operators.
// At this point we don't have to do anything special in this case. Just
// run through the regular paces w/o creating a new task.
work = context.rootToWorkMap.get(root);
} else {
// create a new vertex
if (context.preceedingWork == null) {
work = utils.createMapWork(context, root, tezWork, null);
} else {
work = utils.createReduceWork(context, root, tezWork);
}
context.rootToWorkMap.put(root, work);
}
if (!context.childToWorkMap.containsKey(operator)) {
List<BaseWork> workItems = new LinkedList<BaseWork>();
workItems.add(work);
context.childToWorkMap.put(operator, workItems);
} else {
context.childToWorkMap.get(operator).add(work);
}
// remember which mapjoin operator links with which work
if (!context.currentMapJoinOperators.isEmpty()) {
for (MapJoinOperator mj: context.currentMapJoinOperators) {
LOG.debug("Processing map join: " + mj);
// remember the mapping in case we scan another branch of the
// mapjoin later
if (!context.mapJoinWorkMap.containsKey(mj)) {
List<BaseWork> workItems = new LinkedList<BaseWork>();
workItems.add(work);
context.mapJoinWorkMap.put(mj, workItems);
} else {
context.mapJoinWorkMap.get(mj).add(work);
}
/*
* this happens in case of map join operations.
* The tree looks like this:
*
* RS <--- we are here perhaps
* |
* MapJoin
* / \
* RS TS
* /
* TS
*
* If we are at the RS pointed above, and we may have already visited the
* RS following the TS, we have already generated work for the TS-RS.
* We need to hook the current work to this generated work.
*/
if (context.linkOpWithWorkMap.containsKey(mj)) {
Map<BaseWork,TezEdgeProperty> linkWorkMap = context.linkOpWithWorkMap.get(mj);
if (linkWorkMap != null) {
if (context.linkChildOpWithDummyOp.containsKey(mj)) {
for (Operator<?> dummy: context.linkChildOpWithDummyOp.get(mj)) {
work.addDummyOp((HashTableDummyOperator) dummy);
}
}
for (Entry<BaseWork,TezEdgeProperty> parentWorkMap : linkWorkMap.entrySet()) {
BaseWork parentWork = parentWorkMap.getKey();
LOG.debug("connecting "+parentWork.getName()+" with "+work.getName());
TezEdgeProperty edgeProp = parentWorkMap.getValue();
tezWork.connect(parentWork, work, edgeProp);
// need to set up output name for reduce sink now that we know the name
// of the downstream work
for (ReduceSinkOperator r:
context.linkWorkWithReduceSinkMap.get(parentWork)) {
if (r.getConf().getOutputName() != null) {
LOG.debug("Cloning reduce sink for multi-child broadcast edge");
// we've already set this one up. Need to clone for the next work.
r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(
(ReduceSinkDesc)r.getConf().clone(), r.getParentOperators());
context.clonedReduceSinks.add(r);
}
r.getConf().setOutputName(work.getName());
context.connectedReduceSinks.add(r);
}
}
}
}
}
// clear out the set. we don't need it anymore.
context.currentMapJoinOperators.clear();
}
// This is where we cut the tree as described above. We also remember that
// we might have to connect parent work with this work later.
for (Operator<?> parent: new ArrayList<Operator<?>>(root.getParentOperators())) {
context.leafOperatorToFollowingWork.put(parent, work);
LOG.debug("Removing " + parent + " as parent from " + root);
root.removeParent(parent);
}
if (!context.currentUnionOperators.isEmpty()) {
// if there are union all operators we need to add the work to the set
// of union operators.
UnionWork unionWork;
if (context.unionWorkMap.containsKey(operator)) {
// we've seen this terminal before and have created a union work object.
// just need to add this work to it. There will be no children of this one
// since we've passed this operator before.
assert operator.getChildOperators().isEmpty();
unionWork = (UnionWork) context.unionWorkMap.get(operator);
} else {
// first time through. we need to create a union work object and add this
// work to it. Subsequent work should reference the union and not the actual
// work.
unionWork = utils.createUnionWork(context, operator, tezWork);
}
// finally hook everything up
LOG.debug("Connecting union work ("+unionWork+") with work ("+work+")");
TezEdgeProperty edgeProp = new TezEdgeProperty(EdgeType.CONTAINS);
tezWork.connect(unionWork, work, edgeProp);
unionWork.addUnionOperators(context.currentUnionOperators);
context.currentUnionOperators.clear();
context.workWithUnionOperators.add(work);
work = unionWork;
}
// We're scanning a tree from roots to leaf (this is not technically
// correct, demux and mux operators might form a diamond shape, but
// we will only scan one path and ignore the others, because the
// diamond shape is always contained in a single vertex). The scan
// is depth first and because we remove parents when we pack a pipeline
// into a vertex we will never visit any node twice. But because of that
// we might have a situation where we need to connect 'work' that comes after
// the 'work' we're currently looking at.
//
// Also note: the concept of leaf and root is reversed in hive for historical
// reasons. Roots are data sources, leaves are data sinks. I know.
if (context.leafOperatorToFollowingWork.containsKey(operator)) {
BaseWork followingWork = context.leafOperatorToFollowingWork.get(operator);
LOG.debug("Second pass. Leaf operator: "+operator
+" has common downstream work:"+followingWork);
// need to add this branch to the key + value info