@SuppressWarnings("nls")
private void genMapRedTasks(QB qb) throws SemanticException {
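// Overall flow: first try to answer simple select-star queries with a
// fetch-only plan (no map-reduce); then check whether the query qualifies for
// the LIMIT input-size optimization; then create a fetch task (plain selects)
// or move tasks (loads/CTAS); finally walk the operator tree with the
// rule-based processors below to generate the map-reduce tasks, break up the
// task tree, and run the physical optimizer.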
FetchWork fetch = null;
List<Task<? extends Serializable>> mvTask = new ArrayList<Task<? extends Serializable>>();
FetchTask fetchTask = null;
QBParseInfo qbParseInfo = qb.getParseInfo();
// Does this query need a reduce job?
if (qb.isSelectStarQuery() && qbParseInfo.getDestToClusterBy().isEmpty()
&& qbParseInfo.getDestToDistributeBy().isEmpty()
&& qbParseInfo.getDestToOrderBy().isEmpty()
&& qbParseInfo.getDestToSortBy().isEmpty()) {
boolean noMapRed = false;
Iterator<Map.Entry<String, Table>> iter = qb.getMetaData()
.getAliasToTable().entrySet().iterator();
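// A select-star query has a single source table, so the first (and only)
// alias entry is the table being scanned.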
Table tab = (iter.next()).getValue();
if (!tab.isPartitioned()) {
if (qbParseInfo.getDestToWhereExpr().isEmpty()) {
fetch = new FetchWork(tab.getPath().toString(), Utilities
.getTableDesc(tab), qb.getParseInfo().getOuterQueryLimit());
noMapRed = true;
inputs.add(new ReadEntity(tab));
}
} else {
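// Partitioned table: a fetch-only plan is still possible when partition
// pruning is decided entirely by partition columns and leaves no unknown
// partitions.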
if (topOps.size() == 1) {
TableScanOperator ts = (TableScanOperator) topOps.values().toArray()[0];
// check if the pruner only contains partition columns
if (PartitionPruner.onlyContainsPartnCols(topToTable.get(ts),
opToPartPruner.get(ts))) {
PrunedPartitionList partsList = null;
try {
partsList = opToPartList.get(ts);
if (partsList == null) {
partsList = PartitionPruner.prune(topToTable.get(ts),
opToPartPruner.get(ts), conf, (String) topOps.keySet()
.toArray()[0], prunedPartitions);
opToPartList.put(ts, partsList);
}
} catch (HiveException e) {
// Use the fully qualified name so it does not conflict with
// org.apache.commons.lang.StringUtils
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
// If there is any unknown partition, create a map-reduce job for
// the filter to prune correctly
if ((partsList.getUnknownPartns().size() == 0)) {
List<String> listP = new ArrayList<String>();
List<PartitionDesc> partP = new ArrayList<PartitionDesc>();
Set<Partition> parts = partsList.getConfirmedPartns();
Iterator<Partition> iterParts = parts.iterator();
while (iterParts.hasNext()) {
Partition part = iterParts.next();
listP.add(part.getPartitionPath().toString());
try {
partP.add(Utilities.getPartitionDesc(part));
} catch (HiveException e) {
throw new SemanticException(e.getMessage(), e);
}
inputs.add(new ReadEntity(part));
}
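// Fetch directly from the confirmed partitions; no map-reduce job is needed.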
fetch = new FetchWork(listP, partP, qb.getParseInfo()
.getOuterQueryLimit());
noMapRed = true;
}
}
}
}
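// A plain fetch can answer the query, so skip map-reduce plan generation.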
if (noMapRed) {
if (fetch.getTblDesc() != null) {
PlanUtils.configureInputJobPropertiesForStorageHandler(
fetch.getTblDesc());
} else if ( (fetch.getPartDesc() != null) && (!fetch.getPartDesc().isEmpty())){
PartitionDesc pd0 = fetch.getPartDesc().get(0);
TableDesc td = pd0.getTableDesc();
if ((td != null)&&(td.getProperties() != null)
&& td.getProperties().containsKey(
org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_STORAGE)){
PlanUtils.configureInputJobPropertiesForStorageHandler(td);
}
}
fetchTask = (FetchTask) TaskFactory.get(fetch, conf);
setFetchTask(fetchTask);
// remove root tasks if any
rootTasks.clear();
return;
}
}
// Determine whether the query qualifies for the LIMIT optimization that
// reduces the input size. The query only qualifies when there is exactly one
// top operator, no transform script or UDTF, and no block sampling is used.
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVELIMITOPTENABLE)
&& ctx.getTryCount() == 0 && topOps.size() == 1
&& !globalLimitCtx.ifHasTransformOrUDTF() &&
nameToSplitSample.isEmpty()) {
// Here we recursively check:
// 1. whether there is exactly one LIMIT in the query
// 2. whether there is no aggregation, group-by, distinct, sort by,
// distribute by, or table sampling in any of the sub-queries.
// The query only qualifies if both conditions are satisfied.
//
// Example qualified queries:
// CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
// INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
// FROM ... LIMIT...
// SELECT * FROM (SELECT col1 as col2 FROM (SELECT * FROM ...) t1 LIMIT ...) t2;
//
Integer tempGlobalLimit = checkQbpForGlobalLimit(qb);
// the query qualifies for the optimization
if (tempGlobalLimit != null && tempGlobalLimit != 0) {
TableScanOperator ts = (TableScanOperator) topOps.values().toArray()[0];
Table tab = topToTable.get(ts);
if (!tab.isPartitioned()) {
if (qbParseInfo.getDestToWhereExpr().isEmpty()) {
globalLimitCtx.enableOpt(tempGlobalLimit);
}
} else {
// check if the pruner only contains partition columns
if (PartitionPruner.onlyContainsPartnCols(tab,
opToPartPruner.get(ts))) {
PrunedPartitionList partsList = null;
try {
partsList = opToPartList.get(ts);
if (partsList == null) {
partsList = PartitionPruner.prune(tab,
opToPartPruner.get(ts), conf, (String) topOps.keySet()
.toArray()[0], prunedPartitions);
opToPartList.put(ts, partsList);
}
} catch (HiveException e) {
// Use the fully qualified name so it does not conflict with
// org.apache.commons.lang.StringUtils
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
// If there is any unknown partition, create a map-reduce job for
// the filter to prune correctly
if ((partsList.getUnknownPartns().size() == 0)) {
globalLimitCtx.enableOpt(tempGlobalLimit);
}
}
}
if (globalLimitCtx.isEnable()) {
LOG.info("Qualify the optimize that reduces input size for 'limit' for limit "
+ globalLimitCtx.getGlobalLimit());
}
}
}
// In case of a select, use a fetch task instead of a move task
if (qb.getIsQuery()) {
if ((!loadTableWork.isEmpty()) || (loadFileWork.size() != 1)) {
throw new SemanticException(ErrorMsg.GENERIC_ERROR.getMsg());
}
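// Fetch the single result file directly, using the configured query result
// file format.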
String cols = loadFileWork.get(0).getColumns();
String colTypes = loadFileWork.get(0).getColumnTypes();
String resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
TableDesc resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat);
fetch = new FetchWork(new Path(loadFileWork.get(0).getSourceDir()).toString(),
resultTab, qb.getParseInfo().getOuterQueryLimit());
fetchTask = (FetchTask) TaskFactory.get(fetch, conf);
setFetchTask(fetchTask);
// For the FetchTask, the limit optimization requires fetching all the rows
// in memory and counting how many rows we get. That is not practical if the
// limit factor is too big.
int fetchLimit = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVELIMITOPTMAXFETCH);
if (globalLimitCtx.isEnable() && globalLimitCtx.getGlobalLimit() > fetchLimit) {
LOG.info("For FetchTask, LIMIT " + globalLimitCtx.getGlobalLimit() + " > " + fetchLimit
+ ". Doesn't qualify limit optimiztion.");
globalLimitCtx.disableOpt();
}
} else {
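// Not a plain select: generate a move task for every table load, chaining
// index-update tasks behind it when auto-update is enabled.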
for (LoadTableDesc ltd : loadTableWork) {
Task<MoveWork> tsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false),
conf);
mvTask.add(tsk);
// Check whether we are making any indexes stale, and auto-update them if desired
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, getInputs(), conf);
try {
List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();
for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
tsk.addDependentTask(updateTask);
}
} catch (HiveException e) {
console.printInfo("WARNING: could not auto-update stale indexes, indexes are not in of sync");
}
}
}
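// Generate move tasks for the file loads. For CTAS, the single output file is
// redirected to the table's final location.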
boolean oneLoadFile = true;
for (LoadFileDesc lfd : loadFileWork) {
if (qb.isCTAS()) {
assert (oneLoadFile); // CTAS should not have more than one load file
// make the move task's destination directory the table's destination
String location = qb.getTableDesc().getLocation();
if (location == null) {
// get the table's default location
Table dumpTable;
Path targetPath;
try {
dumpTable = db.newTable(qb.getTableDesc().getTableName());
if (!db.databaseExists(dumpTable.getDbName())) {
throw new SemanticException("ERROR: The database " + dumpTable.getDbName()
+ " does not exist.");
}
Warehouse wh = new Warehouse(conf);
targetPath = wh.getTablePath(db.getDatabase(dumpTable.getDbName()), dumpTable
.getTableName());
} catch (HiveException e) {
throw new SemanticException(e);
} catch (MetaException e) {
throw new SemanticException(e);
}
location = targetPath.toString();
}
lfd.setTargetDir(location);
oneLoadFile = false;
}
mvTask.add(TaskFactory.get(new MoveWork(null, null, null, lfd, false),
conf));
}
}
// generate map reduce plans
ParseContext tempParseContext = getParseContext();
GenMRProcContext procCtx = new GenMRProcContext(
conf,
new HashMap<Operator<? extends Serializable>, Task<? extends Serializable>>(),
new ArrayList<Operator<? extends Serializable>>(), tempParseContext,
mvTask, rootTasks,
new LinkedHashMap<Operator<? extends Serializable>, GenMapRedCtx>(),
inputs, outputs);
// create a walker which walks the tree in a DFS manner while maintaining
// the operator stack.
// The dispatcher generates the plan from the operator tree
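// Operator-tree patterns and the processors that emit map-reduce work when
// they match.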
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
opRules.put(new RuleRegExp(new String("R1"), "TS%"), new GenMRTableScan1());
opRules.put(new RuleRegExp(new String("R2"), "TS%.*RS%"),
new GenMRRedSink1());
opRules.put(new RuleRegExp(new String("R3"), "RS%.*RS%"),
new GenMRRedSink2());
opRules.put(new RuleRegExp(new String("R4"), "FS%"), new GenMRFileSink1());
opRules.put(new RuleRegExp(new String("R5"), "UNION%"), new GenMRUnion1());
opRules.put(new RuleRegExp(new String("R6"), "UNION%.*RS%"),
new GenMRRedSink3());
opRules.put(new RuleRegExp(new String("R6"), "MAPJOIN%.*RS%"),
new GenMRRedSink4());
opRules.put(new RuleRegExp(new String("R7"), "TS%.*MAPJOIN%"),
MapJoinFactory.getTableScanMapJoin());
opRules.put(new RuleRegExp(new String("R8"), "RS%.*MAPJOIN%"),
MapJoinFactory.getReduceSinkMapJoin());
opRules.put(new RuleRegExp(new String("R9"), "UNION%.*MAPJOIN%"),
MapJoinFactory.getUnionMapJoin());
opRules.put(new RuleRegExp(new String("R10"), "MAPJOIN%.*MAPJOIN%"),
MapJoinFactory.getMapJoinMapJoin());
opRules.put(new RuleRegExp(new String("R11"), "MAPJOIN%SEL%"),
MapJoinFactory.getMapJoin());
// The dispatcher fires the processor corresponding to the closest matching
// rule and passes the context along
Dispatcher disp = new DefaultRuleDispatcher(new GenMROperator(), opRules,
procCtx);
GraphWalker ogw = new GenMapRedWalker(disp);
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(topOps.values());
ogw.startWalking(topNodes, null);
// The reduce sink does not have any children. Since the plan has by now been
// broken up into multiple tasks, iterate over all tasks.
// For each task, go over all operators recursively.
for (Task<? extends Serializable> rootTask : rootTasks) {
breakTaskTree(rootTask);
}
// For each task, set the key descriptor for the reducer
for (Task<? extends Serializable> rootTask : rootTasks) {
setKeyDescTaskTree(rootTask);
}
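// Run the physical optimizer over the generated task tree.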
PhysicalContext physicalContext = new PhysicalContext(conf,
getParseContext(), ctx, rootTasks, fetchTask);
PhysicalOptimizer physicalOptimizer = new PhysicalOptimizer(
physicalContext, conf);
physicalOptimizer.optimize();
// For each operator, generate the counters if needed
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEJOBPROGRESS)) {
for (Task<? extends Serializable> rootTask : rootTasks) {
generateCountersTask(rootTask);
}
}
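// Decide the execution mode for the generated tasks (for example, whether
// they can run locally).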
decideExecMode(rootTasks, ctx, globalLimitCtx);
if (qb.isCTAS()) {
// generate a DDL task and make it a dependent task of the leaf tasks
CreateTableDesc crtTblDesc = qb.getTableDesc();
validateCreateTable(crtTblDesc);
// Clear the outputs for CTAS since we don't need the outputs from the
// mapredWork; the DDLWork at the tail of the chain will carry the outputs.
getOutputs().clear();
Task<? extends Serializable> crtTblTask = TaskFactory.get(new DDLWork(
getInputs(), getOutputs(), crtTblDesc), conf);
// find all leaf tasks and make the DDLTask a dependent task of all of them
HashSet<Task<? extends Serializable>> leaves = new HashSet<Task<? extends Serializable>>();
getLeafTasks(rootTasks, leaves);
assert (leaves.size() > 0);
for (Task<? extends Serializable> task : leaves) {
if (task instanceof StatsTask){
// StatsTask requires the table to already exist
for (Task<? extends Serializable> parentOfStatsTask : task.getParentTasks()){
parentOfStatsTask.addDependentTask(crtTblTask);
}
for (Task<? extends Serializable> parentOfCrtTblTask : crtTblTask.getParentTasks()){
parentOfCrtTblTask.removeDependentTask(task);
}
crtTblTask.addDependentTask(task);
} else {
task.addDependentTask(crtTblTask);
}
}
}
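// With the limit optimization enabled, record the expected minimum number of
// rows so it can be detected when the reduced input produced too few rows.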
if (globalLimitCtx.isEnable() && fetchTask != null) {
int fetchLimit = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVELIMITOPTMAXFETCH);
LOG.info("set least row check for FetchTask: " + globalLimitCtx.getGlobalLimit());
fetchTask.getWork().setLeastNumRows(globalLimitCtx.getGlobalLimit());
}
if (globalLimitCtx.isEnable() && globalLimitCtx.getLastReduceLimitDesc() != null) {
LOG.info("set least row check for LimitDesc: " + globalLimitCtx.getGlobalLimit());
globalLimitCtx.getLastReduceLimitDesc().setLeastRows(globalLimitCtx.getGlobalLimit());