Package org.apache.hadoop.hive.ql.exec.mr

Examples of org.apache.hadoop.hive.ql.exec.mr.MapRedTask


  public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs)
      throws SemanticException {
    @SuppressWarnings("unchecked")
    Task<? extends Serializable> currTask = (Task<? extends Serializable>) nd;
    if (currTask instanceof MapRedTask) {
      MapRedTask mrTsk = (MapRedTask)currTask;
      MapredWork mrWrk = mrTsk.getWork();
      checkMapJoins(mrTsk);
      checkMRReducer(currTask.toString(), mrWrk);
    } else if (currTask instanceof ConditionalTask ) {
      List<Task<? extends Serializable>> taskListInConditionalTask =
          ((ConditionalTask) currTask).getListTasks();
View Full Code Here


    // we can only process MapReduce tasks to check input size
    if (!context.getCurrentTask().isMapRedTask()) {
      return null;
    }
    MapRedTask currentTask = (MapRedTask) context.getCurrentTask();

    // get potential reentrant index queries from each index
    Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
    // make sure we have an index on the table being scanned
    TableDesc tblDesc = operator.getTableDesc();

    Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
    for (Index indexOnTable : indexes) {
      if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
        List<Index> newType = new ArrayList<Index>();
        newType.add(indexOnTable);
        indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
      } else {
        indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
      }
    }

    // choose index type with most tsToIndices of the same type on the table
    // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
    List<Index> bestIndexes = indexesByType.values().iterator().next();
    for (List<Index> indexTypes : indexesByType.values()) {
      if (bestIndexes.size() < indexTypes.size()) {
        bestIndexes = indexTypes;
      }
    }

    // rewrite index queries for the chosen index type
    HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
    tmpQueryContext.setQueryPartitions(queryPartitions);
    rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
    List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();

    if (indexTasks != null && indexTasks.size() > 0) {
      queryContexts.put(bestIndexes.get(0), tmpQueryContext);
    }
    // choose an index rewrite to use
    if (queryContexts.size() > 0) {
      // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
      Index chosenIndex = queryContexts.keySet().iterator().next();

      // modify the parse context to use indexing
      // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
      HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);

      // prepare the map reduce job to use indexing
      MapWork work = currentTask.getWork().getMapWork();
      work.setInputformat(queryContext.getIndexInputFormat());
      work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
      // modify inputs based on index query
      Set<ReadEntity> inputs = pctx.getSemanticInputs();
      inputs.addAll(queryContext.getAdditionalSemanticInputs());
View Full Code Here

      Path emptyScratchDir = ctx.getMRTmpPath();
      FileSystem fileSys = emptyScratchDir.getFileSystem(newJob);
      fileSys.mkdirs(emptyScratchDir);

      QueryPlan plan = drv.getPlan();
      MapRedTask selectTask = (MapRedTask)plan.getRootTasks().get(0);

      List<Path> inputPaths = Utilities.getInputPaths(newJob, selectTask.getWork().getMapWork(), emptyScratchDir, ctx, false);
      Utilities.setInputPaths(newJob, inputPaths);

      Utilities.setMapRedWork(newJob, selectTask.getWork(), ctx.getMRTmpPath());

      CombineHiveInputFormat combineInputFormat = ReflectionUtils.newInstance(
          CombineHiveInputFormat.class, newJob);

      InputSplit[] retSplits = combineInputFormat.getSplits(newJob, 1);
View Full Code Here

  public void finished(TaskRunner runner) {
    if (statsTasks.isEmpty() || !(runner.getTask() instanceof MapRedTask)) {
      return;
    }
    MapRedTask mapredTask = (MapRedTask) runner.getTask();

    MapWork mapWork = mapredTask.getWork().getMapWork();
    ReduceWork reduceWork = mapredTask.getWork().getReduceWork();
    List<Operator> operators = new ArrayList<Operator>(mapWork.getAliasToWork().values());
    if (reduceWork != null) {
      operators.add(reduceWork.getReducer());
    }
    final List<String> statKeys = new ArrayList<String>(1);
View Full Code Here

  public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs)
      throws SemanticException {
    @SuppressWarnings("unchecked")
    Task<? extends Serializable> currTask = (Task<? extends Serializable>) nd;
    if (currTask instanceof MapRedTask) {
      MapRedTask mrTsk = (MapRedTask)currTask;
      MapredWork mrWrk = mrTsk.getWork();
      checkMapJoins(mrTsk);
      checkMRReducer(currTask.toString(), mrWrk);
    } else if (currTask instanceof ConditionalTask ) {
      List<Task<? extends Serializable>> taskListInConditionalTask =
          ((ConditionalTask) currTask).getListTasks();
View Full Code Here

    // we can only process MapReduce tasks to check input size
    if (!context.getCurrentTask().isMapRedTask()) {
      return null;
    }
    MapRedTask currentTask = (MapRedTask) context.getCurrentTask();

    // get potential reentrant index queries from each index
    Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
    // make sure we have an index on the table being scanned
    TableDesc tblDesc = operator.getTableDesc();
    Table srcTable = pctx.getTopToTable().get(operator);
    if (indexes == null || indexes.get(srcTable) == null) {
      return null;
    }

    List<Index> tableIndexes = indexes.get(srcTable);
    Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
    for (Index indexOnTable : tableIndexes) {
      if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
        List<Index> newType = new ArrayList<Index>();
        newType.add(indexOnTable);
        indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
      } else {
        indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
      }
    }

    // choose index type with most indexes of the same type on the table
    // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
    List<Index> bestIndexes = indexesByType.values().iterator().next();
    for (List<Index> indexTypes : indexesByType.values()) {
      if (bestIndexes.size() < indexTypes.size()) {
        bestIndexes = indexTypes;
      }
    }

    // rewrite index queries for the chosen index type
    HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
    tmpQueryContext.setQueryPartitions(queryPartitions);
    rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
    List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();

    if (indexTasks != null && indexTasks.size() > 0) {
      queryContexts.put(bestIndexes.get(0), tmpQueryContext);
    }
    // choose an index rewrite to use
    if (queryContexts.size() > 0) {
      // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
      Index chosenIndex = queryContexts.keySet().iterator().next();

      // modify the parse context to use indexing
      // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
      HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);

      // prepare the map reduce job to use indexing
      MapWork work = currentTask.getWork().getMapWork();
      work.setInputformat(queryContext.getIndexInputFormat());
      work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
      // modify inputs based on index query
      Set<ReadEntity> inputs = pctx.getSemanticInputs();
      inputs.addAll(queryContext.getAdditionalSemanticInputs());
View Full Code Here

  // create map join task and set big table as bigTablePosition
  private ObjectPair<MapRedTask, String> convertTaskToMapJoinTask(MapredWork newWork,
      int bigTablePosition) throws UnsupportedEncodingException, SemanticException {
    // create a mapred task for this work
    MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext
        .getParseContext().getConf());
    JoinOperator newJoinOp = getJoinOp(newTask);

    // optimize this newWork given the big table position
    String bigTableAlias =
View Full Code Here

    if (!(childTask instanceof MapRedTask)) {
      // Nothing to do if it is not a MapReduce task.
      return;
    }

    MapRedTask childMapRedTask = (MapRedTask) childTask;
    MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
    MapWork childMapWork = childMapRedTask.getWork().getMapWork();

    Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork =
        mapJoinMapWork.getAliasToWork();
    if (mapJoinAliasToWork.size() > 1) {
      // Do not merge if the MapredWork of MapJoin has multiple input aliases.
      return;
    }

    Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry =
        mapJoinAliasToWork.entrySet().iterator().next();
    String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
    TableScanOperator mapJoinTaskTableScanOperator =
        OperatorUtils.findSingleOperator(
            mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
    if (mapJoinTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + mapJoinAlias +
          ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
    }
    FileSinkOperator mapJoinTaskFileSinkOperator =
        OperatorUtils.findSingleOperator(
            mapJoinTaskTableScanOperator, FileSinkOperator.class);
    if (mapJoinTaskFileSinkOperator == null) {
      throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() +
          " operator at the last operator of the MapJoin Task.");
    }

    // The mapJoinTaskFileSinkOperator writes to a different directory
    String childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName();
    List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
    if (childMRAliases == null || childMRAliases.size() != 1) {
      return;
    }
    String childMRAlias = childMRAliases.get(0);

    MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapLocalWork();
    MapredLocalWork childLocalWork = childMapWork.getMapLocalWork();

    if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) ||
        (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
      // Right now, we do not handle the case that either of them is bucketed.
      // We should relax this constraint with a follow-up jira.
      return;
    }

    // We need to check if the total size of local tables is under the limit.
    // At here, we are using a strong condition, which is the total size of
    // local tables used by all input paths. Actually, we can relax this condition
    // to check the total size of local tables for every input path.
    // Example:
    //               UNION_ALL
    //              /         \
    //             /           \
    //            /             \
    //           /               \
    //       MapJoin1          MapJoin2
    //      /   |   \         /   |   \
    //     /    |    \       /    |    \
    //   Big1   S1   S2    Big2   S3   S4
    // In this case, we have two MapJoins, MapJoin1 and MapJoin2. Big1 and Big2 are two
    // big tables, and S1, S2, S3, and S4 are four small tables. Hash tables of S1 and S2
    // will only be used by Map tasks processing Big1. Hash tables of S3 and S4 will only
    // be used by Map tasks processing Big2. If Big1!=Big2, we should only check if the size
    // of S1 + S2 is under the limit, and if the size of S3 + S4 is under the limit.
    // But, right now, we are checking the size of S1 + S2 + S3 + S4 is under the limit.
    // If Big1=Big2, we will only scan a path once. So, MapJoin1 and MapJoin2 will be executed
    // in the same Map task. In this case, we need to make sure the size of S1 + S2 + S3 + S4
    // is under the limit.
    if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)){
      // The total size of local tables may not be under
      // the limit after we merge mapJoinLocalWork and childLocalWork.
      // Do not merge.
      return;
    }

    TableScanOperator childMRTaskTableScanOperator =
        OperatorUtils.findSingleOperator(
            childMapWork.getAliasToWork().get(childMRAlias), TableScanOperator.class);
    if (childMRTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + childMRAlias +
          ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
    }

    List<Operator<? extends OperatorDesc>> parentsInMapJoinTask =
        mapJoinTaskFileSinkOperator.getParentOperators();
    List<Operator<? extends OperatorDesc>> childrenInChildMRTask =
        childMRTaskTableScanOperator.getChildOperators();
    if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
      // Do not merge if we do not know how to connect two operator trees.
      return;
    }

    // Step 2: Merge mapJoinTask into the Map-side of its child.
    // Step 2.1: Connect the operator trees of two MapRedTasks.
    Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
    Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
    parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
    childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);

    // Step 2.2: Replace the corresponding part childMRWork's MapWork.
    GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias, mapJoinMapWork, childMapWork);

    // Step 2.3: Fill up stuff in local work
    if (mapJoinLocalWork != null) {
      if (childLocalWork == null) {
        childMapWork.setMapLocalWork(mapJoinLocalWork);
      } else {
        childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
        childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
      }
    }

    // Step 2.4: Remove this MapJoin task
    List<Task<? extends Serializable>> parentTasks = mapJoinTask.getParentTasks();
    mapJoinTask.setParentTasks(null);
    mapJoinTask.setChildTasks(null);
    childMapRedTask.getParentTasks().remove(mapJoinTask);
    if (parentTasks != null) {
      childMapRedTask.getParentTasks().addAll(parentTasks);
      for (Task<? extends Serializable> parentTask : parentTasks) {
        parentTask.getChildTasks().remove(mapJoinTask);
        if (!parentTask.getChildTasks().contains(childMapRedTask)) {
          parentTask.getChildTasks().add(childMapRedTask);
        }
      }
    } else {
      if (physicalContext.getRootTasks().contains(mapJoinTask)) {
        physicalContext.removeFromRootTask(mapJoinTask);
        if (childMapRedTask.getParentTasks() != null &&
            childMapRedTask.getParentTasks().size() == 0 &&
            !physicalContext.getRootTasks().contains(childMapRedTask)) {
          physicalContext.addToRootTask(childMapRedTask);
        }
      }
    }
    if (childMapRedTask.getParentTasks().size() == 0) {
      childMapRedTask.setParentTasks(null);
    }
  }
View Full Code Here

      QBJoinTree joinTree)
      throws UnsupportedEncodingException, SemanticException {
    // deep copy a new mapred work
    MapredWork newWork = Utilities.clonePlan(origWork);
    // create a mapred task for this work
    MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext
        .getParseContext().getConf());
    // generate the map join operator; already checked the map join
    MapJoinOperator newMapJoinOp =
        getMapJoinOperator(newTask, newWork, smbJoinOp, joinTree, bigTablePosition);
View Full Code Here

        }

        // create map join task for the given big table position
        ObjectPair<MapRedTask, String> newTaskAlias = convertSMBTaskToMapJoinTask(
            currJoinWork, bigTablePosition, newSMBJoinOp, joinTree);
        MapRedTask newTask = newTaskAlias.getFirst();
        String bigTableAlias = newTaskAlias.getSecond();

        Long aliasKnownSize = aliasToSize.get(bigTableAlias);
        if (aliasKnownSize != null && aliasKnownSize.longValue() > 0) {
          long smallTblTotalKnownSize = aliasTotalKnownInputSize
              - aliasKnownSize.longValue();
          if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
            // this table is not good to be a big table.
            continue;
          }
        }

        // add into conditional task
        listWorks.add(newTask.getWork());
        listTasks.add(newTask);
        newTask.setTaskTag(Task.CONVERTED_MAPJOIN);

        // set up backup task
        newTask.setBackupTask(currTask);
        newTask.setBackupChildrenTasks(currTask.getChildTasks());

        // put the mapping alias to task
        aliasToTask.put(bigTableAlias, newTask);
      }
    } catch (Exception e) {
View Full Code Here

TOP

Related Classes of org.apache.hadoop.hive.ql.exec.mr.MapRedTask

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.