/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMRMapJoinCtx;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext;
import org.apache.hadoop.hive.ql.parse.ErrorMsg;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
/**
* Operator factory for MapJoin processing.
*/
/**
 * Operator factory for MapJoin processing during map-reduce plan generation.
 * Supplies the {@link NodeProcessor}s that the plan-generation graph walker
 * invokes when it encounters a map-join operator in each of the shapes a
 * map-join can take in the operator tree: below a table scan, below a reduce
 * sink, below a union, below another map-join, and above a select.
 * <p>
 * Each processor reads and mutates the shared {@link GenMRProcContext}
 * (current task, current top operator, current alias) as the walk proceeds.
 */
public final class MapJoinFactory {

  /**
   * Find which parent branch the walker used to arrive at the given
   * map-join operator.
   *
   * @param op    the map-join operator currently on top of the walker's stack
   * @param stack the walker's traversal stack; the element immediately below
   *              the top is the parent through which {@code op} was reached
   * @return the index of that parent in {@code op}'s parent-operator list
   */
  public static int getPositionParent(AbstractMapJoinOperator<? extends MapJoinDesc> op, Stack<Node> stack) {
    int size = stack.size();
    assert size >= 2 && stack.get(size - 1) == op;
    Operator<? extends Serializable> parent = (Operator<? extends Serializable>) stack
        .get(size - 2);
    List<Operator<? extends Serializable>> parOp = op.getParentOperators();
    int pos = parOp.indexOf(parent);
    assert pos < parOp.size();
    return pos;
  }

  /**
   * TableScan followed by MapJoin.
   */
  public static class TableScanMapJoin implements NodeProcessor {

    /**
     * Attaches the map-join to the map-reduce task already associated with
     * the parent branch, either initializing a new map-join plan or merging
     * the current branch's plan into the task created for another branch.
     */
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      AbstractMapJoinOperator<MapJoinDesc> mapJoin = (AbstractMapJoinOperator<MapJoinDesc>) nd;
      GenMRProcContext ctx = (GenMRProcContext) procCtx;

      // find the branch on which this processor was invoked
      int pos = getPositionParent(mapJoin, stack);

      // Recover the task/top-op/alias context recorded for the parent branch.
      Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = ctx
          .getMapCurrCtx();
      GenMapRedCtx mapredCtx = mapCurrCtx.get(mapJoin.getParentOperators().get(
          pos));
      Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
      MapredWork currPlan = (MapredWork) currTask.getWork();
      Operator<? extends Serializable> currTopOp = mapredCtx.getCurrTopOp();
      String currAliasId = mapredCtx.getCurrAliasId();
      Operator<? extends Serializable> reducer = mapJoin;
      HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = ctx
          .getOpTaskMap();
      Task<? extends Serializable> opMapTask = opTaskMap.get(reducer);

      ctx.setCurrTopOp(currTopOp);
      ctx.setCurrAliasId(currAliasId);
      ctx.setCurrTask(currTask);

      // If the plan for this reducer does not exist, initialize the plan
      if (opMapTask == null) {
        assert currPlan.getReducer() == null;
        GenMapRedUtils.initMapJoinPlan(mapJoin, ctx, false, false, false, pos);
      } else {
        // The current plan can be thrown away after being merged with the
        // original plan
        GenMapRedUtils.joinPlan(mapJoin, null, opMapTask, ctx, pos, false,
            false, false);
        currTask = opMapTask;
        ctx.setCurrTask(currTask);
      }

      // Record the (possibly updated) context for this operator so children
      // of the map-join pick it up when they are walked.
      mapCurrCtx.put(mapJoin, new GenMapRedCtx(ctx.getCurrTask(), ctx
          .getCurrTopOp(), ctx.getCurrAliasId()));
      return null;
    }
  }

  /**
   * ReduceSink followed by MapJoin.
   */
  public static class ReduceSinkMapJoin implements NodeProcessor {

    /**
     * Splits the plan at the reduce sink: a new map-reduce task is created
     * for the work below the map-join, and the current task is split so the
     * reduce-sink output feeds the join via an intermediate file.
     */
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      AbstractMapJoinOperator<MapJoinDesc> mapJoin = (AbstractMapJoinOperator<MapJoinDesc>) nd;
      GenMRProcContext opProcCtx = (GenMRProcContext) procCtx;

      ParseContext parseCtx = opProcCtx.getParseCtx();
      MapredWork cplan = GenMapRedUtils.getMapRedWork(parseCtx);
      Task<? extends Serializable> redTask = TaskFactory.get(cplan, parseCtx
          .getConf());
      Task<? extends Serializable> currTask = opProcCtx.getCurrTask();

      // find the branch on which this processor was invoked
      int pos = getPositionParent(mapJoin, stack);
      // Every table except the big table is loaded into the local hash table.
      boolean local = pos != mapJoin.getConf().getPosBigTable();

      GenMapRedUtils.splitTasks(mapJoin, currTask, redTask, opProcCtx, false,
          local, pos);

      currTask = opProcCtx.getCurrTask();
      HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = opProcCtx
          .getOpTaskMap();
      Task<? extends Serializable> opMapTask = opTaskMap.get(mapJoin);

      // If the plan for this reducer does not exist, initialize the plan
      if (opMapTask == null) {
        assert cplan.getReducer() == null;
        opTaskMap.put(mapJoin, currTask);
        opProcCtx.setCurrMapJoinOp(null);
      } else {
        // The current plan can be thrown away after being merged with the
        // original plan
        GenMapRedUtils.joinPlan(mapJoin, currTask, opMapTask, opProcCtx, pos,
            false, false, false);
        currTask = opMapTask;
        opProcCtx.setCurrTask(currTask);
      }

      return null;
    }
  }

  /**
   * MapJoin followed by Select.
   */
  public static class MapJoin implements NodeProcessor {

    /**
     * Create a task by splitting the plan below the join. The reason, we have
     * to do so in the processing of Select and not MapJoin is due to the
     * walker. While processing a node, it is not safe to alter its children
     * because that will decide the course of the walk. It is perfectly fine to
     * muck around with its parents though, since those nodes have already been
     * visited.
     */
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      SelectOperator sel = (SelectOperator) nd;
      AbstractMapJoinOperator<MapJoinDesc> mapJoin = (AbstractMapJoinOperator<MapJoinDesc>) sel.getParentOperators().get(
          0);
      assert sel.getParentOperators().size() == 1;

      GenMRProcContext ctx = (GenMRProcContext) procCtx;
      ParseContext parseCtx = ctx.getParseCtx();

      // is the mapjoin followed by a reducer
      List<AbstractMapJoinOperator<? extends MapJoinDesc>> listMapJoinOps = parseCtx
          .getListMapJoinOpsNoReducer();

      if (listMapJoinOps.contains(mapJoin)) {
        // No reducer follows: nothing to split, just clear the current
        // top-op/alias and record context for this select.
        ctx.setCurrAliasId(null);
        ctx.setCurrTopOp(null);
        Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = ctx
            .getMapCurrCtx();
        mapCurrCtx.put((Operator<? extends Serializable>) nd, new GenMapRedCtx(
            ctx.getCurrTask(), null, null));
        return null;
      }

      ctx.setCurrMapJoinOp(mapJoin);
      Task<? extends Serializable> currTask = ctx.getCurrTask();
      GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(mapJoin);
      if (mjCtx == null) {
        mjCtx = new GenMRMapJoinCtx();
        ctx.setMapJoinCtx(mapJoin, mjCtx);
      }

      MapredWork mjPlan = GenMapRedUtils.getMapRedWork(parseCtx);
      Task<? extends Serializable> mjTask = TaskFactory.get(mjPlan, parseCtx
          .getConf());

      TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils
          .getFieldSchemasFromRowSchema(mapJoin.getSchema(), "temporarycol"));

      // generate the temporary file
      Context baseCtx = parseCtx.getContext();
      String taskTmpDir = baseCtx.getMRTmpFileURI();

      // Add the path to alias mapping
      mjCtx.setTaskTmpDir(taskTmpDir);
      mjCtx.setTTDesc(tt_desc);
      mjCtx.setRootMapJoinOp(sel);
      // Detach the select from the join: the select becomes the root of the
      // follow-on task, which reads the intermediate file written below.
      sel.setParentOperators(null);

      // Create a file sink operator for this file name
      Operator<? extends Serializable> fs_op = OperatorFactory.get(
          new FileSinkDesc(taskTmpDir, tt_desc, parseCtx.getConf().getBoolVar(
          HiveConf.ConfVars.COMPRESSINTERMEDIATE)), mapJoin.getSchema());

      // Replace the map-join's single child (the select) with the file sink.
      assert mapJoin.getChildOperators().size() == 1;
      mapJoin.getChildOperators().set(0, fs_op);

      List<Operator<? extends Serializable>> parentOpList = new ArrayList<Operator<? extends Serializable>>();
      parentOpList.add(mapJoin);
      fs_op.setParentOperators(parentOpList);

      currTask.addDependentTask(mjTask);

      ctx.setCurrTask(mjTask);
      ctx.setCurrAliasId(null);
      ctx.setCurrTopOp(null);

      Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = ctx
          .getMapCurrCtx();
      mapCurrCtx.put((Operator<? extends Serializable>) nd, new GenMapRedCtx(
          ctx.getCurrTask(), null, null));

      return null;
    }
  }

  /**
   * MapJoin followed by MapJoin.
   */
  public static class MapJoinMapJoin implements NodeProcessor {

    /**
     * Chains the current map-join below a previously seen map-join: the old
     * join is recorded in this join's {@link GenMRMapJoinCtx}, then the plan
     * is initialized or merged exactly as in the table-scan case, but with
     * the read-from-intermediate-file flag set.
     */
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      AbstractMapJoinOperator<? extends MapJoinDesc> mapJoin = (AbstractMapJoinOperator<? extends MapJoinDesc>) nd;
      GenMRProcContext ctx = (GenMRProcContext) procCtx;

      // Remember the map-join we arrived from so the plan generator can wire
      // its intermediate output into this join.
      AbstractMapJoinOperator<? extends MapJoinDesc> oldMapJoin = ctx.getCurrMapJoinOp();
      GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(mapJoin);
      if (mjCtx != null) {
        mjCtx.setOldMapJoin(oldMapJoin);
      } else {
        ctx.setMapJoinCtx(mapJoin, new GenMRMapJoinCtx(null, null, null,
            oldMapJoin));
      }
      ctx.setCurrMapJoinOp(mapJoin);

      // find the branch on which this processor was invoked
      int pos = getPositionParent(mapJoin, stack);

      Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = ctx
          .getMapCurrCtx();
      GenMapRedCtx mapredCtx = mapCurrCtx.get(mapJoin.getParentOperators().get(
          pos));
      Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
      MapredWork currPlan = (MapredWork) currTask.getWork();
      Operator<? extends Serializable> reducer = mapJoin;
      HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = ctx
          .getOpTaskMap();
      Task<? extends Serializable> opMapTask = opTaskMap.get(reducer);

      ctx.setCurrTask(currTask);

      // If the plan for this reducer does not exist, initialize the plan
      if (opMapTask == null) {
        assert currPlan.getReducer() == null;
        GenMapRedUtils.initMapJoinPlan(mapJoin, ctx, true, false, false, pos);
      } else {
        // The current plan can be thrown away after being merged with the
        // original plan
        GenMapRedUtils.joinPlan(mapJoin, currTask, opMapTask, ctx, pos, false,
            true, false);
        currTask = opMapTask;
        ctx.setCurrTask(currTask);
      }

      mapCurrCtx.put(mapJoin, new GenMapRedCtx(ctx.getCurrTask(), null, null));
      return null;
    }
  }

  /**
   * Union followed by MapJoin.
   */
  public static class UnionMapJoin implements NodeProcessor {

    /**
     * Handles a map-join fed by a union. A map-only union subquery needs no
     * special handling and is delegated to {@link TableScanMapJoin}; the
     * union result must be the big (streamed) table, otherwise the query is
     * rejected.
     */
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      GenMRProcContext ctx = (GenMRProcContext) procCtx;

      ParseContext parseCtx = ctx.getParseCtx();
      UnionProcContext uCtx = parseCtx.getUCtx();

      // union was map only - no special processing needed
      if (uCtx.isMapOnlySubq()) {
        return (new TableScanMapJoin())
            .process(nd, stack, procCtx, nodeOutputs);
      }

      UnionOperator currUnion = ctx.getCurrUnionOp();
      assert currUnion != null;
      AbstractMapJoinOperator<MapJoinDesc> mapJoin = (AbstractMapJoinOperator<MapJoinDesc>) nd;

      // find the branch on which this processor was invoked
      int pos = getPositionParent(mapJoin, stack);

      Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = ctx
          .getMapCurrCtx();
      GenMapRedCtx mapredCtx = mapCurrCtx.get(mapJoin.getParentOperators().get(
          pos));
      Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
      MapredWork currPlan = (MapredWork) currTask.getWork();
      Operator<? extends Serializable> reducer = mapJoin;
      HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = ctx
          .getOpTaskMap();
      Task<? extends Serializable> opMapTask = opTaskMap.get(reducer);

      // union result cannot be a map table
      boolean local = pos != mapJoin.getConf().getPosBigTable();
      if (local) {
        throw new SemanticException(ErrorMsg.INVALID_MAPJOIN_TABLE.getMsg());
      }

      // If the plan for this reducer does not exist, initialize the plan
      if (opMapTask == null) {
        assert currPlan.getReducer() == null;
        ctx.setCurrMapJoinOp(mapJoin);
        GenMapRedUtils.initMapJoinPlan(mapJoin, ctx, true, true, false, pos);
        ctx.setCurrUnionOp(null);
      } else {
        // The current plan can be thrown away after being merged with the
        // original plan
        Task<? extends Serializable> uTask = ctx.getUnionTask(
            ctx.getCurrUnionOp()).getUTask();
        if (uTask.getId().equals(opMapTask.getId())) {
          // Already merged into the union's task; no extra link needed.
          GenMapRedUtils.joinPlan(mapJoin, null, opMapTask, ctx, pos, false,
              false, true);
        } else {
          GenMapRedUtils.joinPlan(mapJoin, uTask, opMapTask, ctx, pos, false,
              false, true);
        }
        currTask = opMapTask;
        ctx.setCurrTask(currTask);
      }

      mapCurrCtx.put(mapJoin, new GenMapRedCtx(ctx.getCurrTask(), ctx
          .getCurrTopOp(), ctx.getCurrAliasId()));
      return null;
    }
  }

  /** @return processor for a map-join below a table scan */
  public static NodeProcessor getTableScanMapJoin() {
    return new TableScanMapJoin();
  }

  /** @return processor for a map-join below a union */
  public static NodeProcessor getUnionMapJoin() {
    return new UnionMapJoin();
  }

  /** @return processor for a map-join below a reduce sink */
  public static NodeProcessor getReduceSinkMapJoin() {
    return new ReduceSinkMapJoin();
  }

  /** @return processor for a select above a map-join */
  public static NodeProcessor getMapJoin() {
    return new MapJoin();
  }

  /** @return processor for a map-join below another map-join */
  public static NodeProcessor getMapJoinMapJoin() {
    return new MapJoinMapJoin();
  }

  private MapJoinFactory() {
    // prevent instantiation
  }
}