Package org.apache.hadoop.hive.ql.exec.mr

Examples of org.apache.hadoop.hive.ql.exec.mr.MapRedTask

The excerpts below come from the Hive query planner, physical optimizer, and test suite. They show how MapRedTask instances are created through TaskFactory, converted into map-join tasks, merged into child tasks, and executed directly. Each excerpt is truncated to the portion relevant to MapRedTask.

Example 1: creating a map-join task, with backup, for each big-table candidate of a sort-merge join

        if (!bigTableCandidates.contains(bigTablePosition)) {
          continue;
        }

        // create map join task for the given big table position
        MapRedTask newTask = convertSMBTaskToMapJoinTask(
            currJoinWork, bigTablePosition, newSMBJoinOp, joinTree);

        MapWork mapWork = newTask.getWork().getMapWork();
        Operator<?> parentOp = originalSMBJoinOp.getParentOperators().get(bigTablePosition);
        Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);

        long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
        if (aliasKnownSize > 0) {
          long smallTblTotalKnownSize = aliasTotalKnownInputSize - aliasKnownSize;
          if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
            // the remaining small tables are too large; this position cannot be the big table.
            continue;
          }
        }

        // add into conditional task
        listWorks.add(newTask.getWork());
        listTasks.add(newTask);
        newTask.setTaskTag(Task.CONVERTED_MAPJOIN);

        // set up backup task
        newTask.setBackupTask(currTask);
        newTask.setBackupChildrenTasks(currTask.getChildTasks());

        // record the mapping from the new task to its input aliases
        taskToAliases.put(newTask, aliases);
      }
    } catch (Exception e) {
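
The loop above collects one candidate task per big-table position into listWorks, listTasks, and taskToAliases. A minimal sketch of how these typically feed a ConditionalTask afterwards, assuming the dispatcher's HiveConf (conf) and the unconverted currTask are in scope and that the setter names match this Hive version; ConditionalResolverCommonJoinCtx is the resolver's nested context class:

    // Sketch only: wire the candidate tasks into a conditional task whose
    // resolver picks the actual plan at run time, once input sizes are known.
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTask = (ConditionalTask) TaskFactory.get(cndWork, conf);
    cndTask.setListTasks(listTasks);
    cndTask.setResolver(new ConditionalResolverCommonJoin());

    // The resolver context records which task serves which aliases and which
    // task is the unconverted (backup) common join.
    ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
    resolverCtx.setTaskToAliases(taskToAliases);
    resolverCtx.setCommonJoinTask(currTask);
    cndTask.setResolverCtx(resolverCtx);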

Example 2: planning ANALYZE TABLE ... COMPUTE STATISTICS, with and without scanning the data

        .getInputFormatClass();
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();

    // create a dummy MapReduce task
    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
    MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork, parseCtx.getConf());
    Operator<? extends OperatorDesc> currTopOp = op;
    ctx.setCurrTask(currTask);
    ctx.setCurrTopOp(currTopOp);

    for (String alias : parseCtx.getTopOps().keySet()) {
      Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
      if (currOp == op) {
        String currAliasId = alias;
        ctx.setCurrAliasId(currAliasId);
        mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));

        QBParseInfo parseInfo = parseCtx.getQB().getParseInfo();
        if (parseInfo.isAnalyzeCommand()) {
          boolean partialScan = parseInfo.isPartialScanAnalyzeCommand();
          boolean noScan = parseInfo.isNoScanAnalyzeCommand();
          if (inputFormat.equals(OrcInputFormat.class) && (noScan || partialScan)) {

            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
            // There will not be any MR or Tez job above this task
            StatsNoJobWork snjWork = new StatsNoJobWork(parseCtx.getQB().getParseInfo().getTableSpec());
            snjWork.setStatsReliable(parseCtx.getConf().getBoolVar(
                HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseCtx.getConf());
            ctx.setCurrTask(snjTask);
            ctx.setCurrTopOp(null);
            ctx.getRootTasks().clear();
            ctx.getRootTasks().add(snjTask);
          } else {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
            // The plan consists of a simple MapRedTask followed by a StatsTask.
            // The MR task is just a simple TableScanOperator

            StatsWork statsWork = new StatsWork(parseCtx.getQB().getParseInfo().getTableSpec());
            statsWork.setAggKey(op.getConf().getStatsAggPrefix());
            statsWork.setSourceTask(currTask);
            statsWork.setStatsReliable(parseCtx.getConf().getBoolVar(
                HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());
            currTask.addDependentTask(statsTask);
            if (!ctx.getRootTasks().contains(currTask)) {
              ctx.getRootTasks().add(currTask);
            }

            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
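
For the plain ANALYZE case above, the scan task and the stats task are wired through Hive's generic parent/child task API. A minimal sketch of that pattern in isolation, reusing names from the excerpt:

    // Sketch: create the work, wrap it in a task, and chain a dependent task.
    MapredWork work = GenMapRedUtils.getMapRedWork(parseCtx);
    MapRedTask scanTask = (MapRedTask) TaskFactory.get(work, parseCtx.getConf());

    StatsWork statsWork = new StatsWork(parseInfo.getTableSpec());
    Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());

    // statsTask becomes a child of scanTask and runs only after it succeeds;
    // only the parentless scan task is registered as a root of the plan DAG.
    scanTask.addDependentTask(statsTask);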

Example 3: executing a MapRedTask directly in a unit test

    rWork.setReducer(op5);
  }

  private void executePlan() throws Exception {
    String testName = new Exception().getStackTrace()[1].getMethodName();
    MapRedTask mrtask = new MapRedTask();
    DriverContext dctx = new DriverContext();
    mrtask.setWork(mr);
    mrtask.initialize(conf, null, dctx);
    int exitVal = mrtask.execute(dctx);

    if (exitVal != 0) {
      LOG.error(testName + " execution failed with exit status: " + exitVal);
      fail(testName + " execution failed");
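
On a nonzero exit status, the task usually records the underlying cause as well. A hedged addition to the helper, assuming Task.getException() is available in this Hive version:

    // Sketch: surface the root cause recorded on the task, if any.
    if (exitVal != 0 && mrtask.getException() != null) {
      LOG.error(testName + " root cause", mrtask.getException());
    }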

Example 4: cloning a plan and converting a sort-merge join task into a map-join task

      QBJoinTree joinTree)
      throws UnsupportedEncodingException, SemanticException {
    // deep copy a new mapred work
    MapredWork newWork = Utilities.clonePlan(origWork);
    // create a mapred task for this work
    MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext
        .getParseContext().getConf());
    // generate the map join operator; the conversion has already been validated
    MapJoinOperator newMapJoinOp =
        getMapJoinOperator(newTask, newWork, smbJoinOp, joinTree, bigTablePosition);

Example 5: the same big-table candidate loop as Example 1, additionally propagating the fetch-source flag

        if (!bigTableCandidates.contains(bigTablePosition)) {
          continue;
        }

        // create map join task for the given big table position
        MapRedTask newTask = convertSMBTaskToMapJoinTask(
            currJoinWork, bigTablePosition, newSMBJoinOp, joinTree);

        MapWork mapWork = newTask.getWork().getMapWork();
        Operator<?> parentOp = originalSMBJoinOp.getParentOperators().get(bigTablePosition);
        Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);

        long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
        if (aliasKnownSize > 0) {
          long smallTblTotalKnownSize = aliasTotalKnownInputSize - aliasKnownSize;
          if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
            // the remaining small tables are too large; this position cannot be the big table.
            continue;
          }
        }

        // add into conditional task
        listWorks.add(newTask.getWork());
        listTasks.add(newTask);
        newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
        newTask.setFetchSource(currTask.isFetchSource());

        // set up backup task
        newTask.setBackupTask(currTask);
        newTask.setBackupChildrenTasks(currTask.getChildTasks());

        // record the mapping from the new task to its input aliases
        taskToAliases.put(newTask, aliases);
      }
    } catch (Exception e) {

Example 6: converting a common-join task into a map-join task for a chosen big-table position

  // create a map join task with the table at bigTablePosition as the big table
  private MapRedTask convertTaskToMapJoinTask(MapredWork newWork, int bigTablePosition)
      throws UnsupportedEncodingException, SemanticException {
    // create a mapred task for this work
    MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext
        .getParseContext().getConf());
    JoinOperator newJoinOp = getJoinOp(newTask);

    // optimize this newWork given the big table position
    MapJoinProcessor.genMapJoinOpAndLocalWork(physicalContext.getParseContext().getConf(),

Example 7: merging a map-join task into its child MapReduce task

    if (!(childTask instanceof MapRedTask)) {
      // Nothing to do if it is not a MapReduce task.
      return;
    }

    MapRedTask childMapRedTask = (MapRedTask) childTask;
    MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
    MapWork childMapWork = childMapRedTask.getWork().getMapWork();

    Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork =
        mapJoinMapWork.getAliasToWork();
    if (mapJoinAliasToWork.size() > 1) {
      // Do not merge if the MapredWork of MapJoin has multiple input aliases.
      return;
    }

    Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry =
        mapJoinAliasToWork.entrySet().iterator().next();
    String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
    TableScanOperator mapJoinTaskTableScanOperator =
        OperatorUtils.findSingleOperator(
            mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
    if (mapJoinTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + mapJoinAlias +
          ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
    }
    FileSinkOperator mapJoinTaskFileSinkOperator =
        OperatorUtils.findSingleOperator(
            mapJoinTaskTableScanOperator, FileSinkOperator.class);
    if (mapJoinTaskFileSinkOperator == null) {
      throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() +
          " operator at the last operator of the MapJoin Task.");
    }

    // The mapJoinTaskFileSinkOperator writes to a different directory
    String childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName().toString();
    List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
    if (childMRAliases == null || childMRAliases.size() != 1) {
      return;
    }
    String childMRAlias = childMRAliases.get(0);

    // Sanity check to make sure there is no alias conflict after merge.
    for (Entry<String, ArrayList<String>> entry : childMapWork.getPathToAliases().entrySet()) {
      String path = entry.getKey();
      List<String> aliases = entry.getValue();

      if (path.equals(childMRPath)) {
        continue;
      }

      if (aliases.contains(mapJoinAlias)) {
        // an alias conflict should not happen here; do not merge.
        return;
      }
    }

    MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapRedLocalWork();
    MapredLocalWork childLocalWork = childMapWork.getMapRedLocalWork();

    if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) ||
        (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
      // Right now, we do not handle the case that either of them is bucketed.
      // We should relax this constraint with a follow-up jira.
      return;
    }

    // We need to check if the total size of the local tables is under the limit.
    // Here we use a strong condition: the total size of local tables used by
    // all input paths combined. This could be relaxed to check the total size
    // of local tables for each input path separately.
    // Example:
    //               UNION_ALL
    //              /         \
    //             /           \
    //            /             \
    //           /               \
    //       MapJoin1          MapJoin2
    //      /   |   \         /   |   \
    //     /    |    \       /    |    \
    //   Big1   S1   S2    Big2   S3   S4
    // In this case, we have two MapJoins, MapJoin1 and MapJoin2. Big1 and Big2 are two
    // big tables, and S1, S2, S3, and S4 are four small tables. Hash tables of S1 and S2
    // will only be used by Map tasks processing Big1. Hash tables of S3 and S4 will only
    // be used by Map tasks processing Big2. If Big1!=Big2, we should only check if the size
    // of S1 + S2 is under the limit, and if the size of S3 + S4 is under the limit.
    // But, right now, we are checking the size of S1 + S2 + S3 + S4 is under the limit.
    // If Big1=Big2, we will only scan a path once. So, MapJoin1 and MapJoin2 will be executed
    // in the same Map task. In this case, we need to make sure the size of S1 + S2 + S3 + S4
    // is under the limit.
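    // For example, with a limit of 25MB and Big1 != Big2: if S1 + S2 = 15MB and
    // S3 + S4 = 15MB, a per-path check would accept the merge (15MB per path),
    // but the combined check used here rejects it, since 15 + 15 = 30MB > 25MB.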
    if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)){
      // The total size of local tables may not be under
      // the limit after we merge mapJoinLocalWork and childLocalWork.
      // Do not merge.
      return;
    }

    TableScanOperator childMRTaskTableScanOperator =
        OperatorUtils.findSingleOperator(
            childMapWork.getAliasToWork().get(childMRAlias), TableScanOperator.class);
    if (childMRTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + childMRAlias +
          ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
    }

    List<Operator<? extends OperatorDesc>> parentsInMapJoinTask =
        mapJoinTaskFileSinkOperator.getParentOperators();
    List<Operator<? extends OperatorDesc>> childrenInChildMRTask =
        childMRTaskTableScanOperator.getChildOperators();
    if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
      // Do not merge if we do not know how to connect two operator trees.
      return;
    }

    // Step 2: Merge mapJoinTask into the Map-side of its child.
    // Step 2.1: Connect the operator trees of two MapRedTasks.
    Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
    Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
    parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
    childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);

    // Step 2.2: Replace the corresponding part of the child task's MapWork.
    GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias, mapJoinMapWork, childMapWork);

    // Step 2.3: Merge the map join task's local work into the child's local work.
    if (mapJoinLocalWork != null) {
      if (childLocalWork == null) {
        childMapWork.setMapRedLocalWork(mapJoinLocalWork);
      } else {
        childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
        childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
      }
    }

    // Step 2.4: Remove this MapJoin task
    List<Task<? extends Serializable>> parentTasks = mapJoinTask.getParentTasks();
    mapJoinTask.setParentTasks(null);
    mapJoinTask.setChildTasks(null);
    childMapRedTask.getParentTasks().remove(mapJoinTask);
    if (parentTasks != null) {
      childMapRedTask.getParentTasks().addAll(parentTasks);
      for (Task<? extends Serializable> parentTask : parentTasks) {
        parentTask.getChildTasks().remove(mapJoinTask);
        if (!parentTask.getChildTasks().contains(childMapRedTask)) {
          parentTask.getChildTasks().add(childMapRedTask);
        }
      }
    } else {
      if (physicalContext.getRootTasks().contains(mapJoinTask)) {
        physicalContext.removeFromRootTask(mapJoinTask);
        if (childMapRedTask.getParentTasks() != null &&
            childMapRedTask.getParentTasks().size() == 0 &&
            !physicalContext.getRootTasks().contains(childMapRedTask)) {
          physicalContext.addToRootTask(childMapRedTask);
        }
      }
    }
    if (childMapRedTask.getParentTasks().size() == 0) {
      childMapRedTask.setParentTasks(null);
    }
  }
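
The isLocalTableTotalSizeUnderLimitAfterMerge check used above is not part of this excerpt. A minimal sketch of the idea, written against plain maps instead of Hive's MapredLocalWork; the method shown, its parameters, and the use of hive.auto.convert.join.noconditionaltask.size as the limit are assumptions, not the actual implementation:

    // Sketch only: merging is allowed when the combined size of all small
    // tables from both tasks stays under the map-join conversion threshold.
    static boolean underLimitAfterMerge(HiveConf conf,
        Map<String, Long> mapJoinSmallTableSizes,  // hypothetical: alias -> bytes
        Map<String, Long> childSmallTableSizes) {  // hypothetical: alias -> bytes
      long limit = HiveConf.getLongVar(conf,
          HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
      long total = 0;
      for (long size : mapJoinSmallTableSizes.values()) {
        total += size;
      }
      for (long size : childSmallTableSizes.values()) {
        total += size;
      }
      return total <= limit;
    }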

Example 8: the common-join dispatcher, choosing between an unconditional map join and a conditional task

      currWork.setOpParseCtxMap(parseCtx.getOpParseCtx());
      currWork.setJoinTree(joinTree);

      if (bigTablePosition >= 0) {
        // create map join task and set big table as bigTablePosition
        MapRedTask newTask = convertTaskToMapJoinTask(currTask.getWork(), bigTablePosition);

        newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
        newTask.setFetchSource(currTask.isFetchSource());
        replaceTask(currTask, newTask, physicalContext);

        // Can this task be merged with its child task? This can happen if a big
        // table is being joined with multiple small tables on different keys.
        if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)) {
          mergeMapJoinTaskIntoItsChildMapRedTask(newTask, conf);
        }

        return newTask;
      }

      long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf,
          HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
      for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
        // this table cannot be the big table
        if (!bigTableCandidates.contains(pos)) {
          continue;
        }
        // deep copy a new mapred work from xml
        // Once HIVE-4396 is in, it would be faster to use a cheaper method to clone the plan
        MapredWork newWork = Utilities.clonePlan(currTask.getWork());

        // create a map join task with position pos as the big table
        MapRedTask newTask = convertTaskToMapJoinTask(newWork, pos);

        Operator<?> startOp = joinOp.getParentOperators().get(pos);
        Set<String> aliases = GenMapRedUtils.findAliases(currWork, startOp);

        long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
        if (cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
          continue;
        }

        // add into conditional task
        listWorks.add(newTask.getWork());
        listTasks.add(newTask);
        newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
        newTask.setFetchSource(currTask.isFetchSource());

        // set up backup task
        newTask.setBackupTask(currTask);
        newTask.setBackupChildrenTasks(currTask.getChildTasks());

        // record the mapping from the new task to its input aliases
        taskToAliases.put(newTask, aliases);
      }
    } catch (Exception e) {
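
The cannotConvert helper called above is not shown in this excerpt; judging from the inline version of the same check in Example 1, it plausibly looks like this:

    // Sketch, mirroring the inline check from Example 1: if the sizes of the
    // other (small) tables are known and their sum exceeds the threshold,
    // this position cannot serve as the big table.
    private boolean cannotConvert(long aliasKnownSize,
        long aliasTotalKnownInputSize, long thresholdOfSmallTblSizeSum) {
      if (aliasKnownSize > 0) {
        long smallTblTotalKnownSize = aliasTotalKnownInputSize - aliasKnownSize;
        if (smallTblTotalKnownSize > thresholdOfSmallTblSizeSum) {
          return true;
        }
      }
      return false;
    }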