Package org.apache.hadoop.hive.ql.plan

Examples of org.apache.hadoop.hive.ql.plan.MapWork



  public MapWork createMapWork(GenTezProcContext context, Operator<?> root,
      TezWork tezWork, PrunedPartitionList partitions) throws SemanticException {
    assert root.getParentOperators().isEmpty();
    MapWork mapWork = new MapWork("Map "+ (++sequenceNumber));
    LOG.debug("Adding map work (" + mapWork.getName() + ") for " + root);

    // map work starts with table scan operators
    assert root instanceof TableScanOperator;
    String alias = ((TableScanOperator)root).getConf().getAlias();
View Full Code Here
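The pattern above — a freshly constructed MapWork named after a running counter and keyed by the table scan's alias — reduces to a handful of calls. Below is a minimal, hypothetical sketch; the class and method names are illustrative, and only the MapWork and TableScanOperator calls are taken from the snippet:

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.plan.MapWork;

public class MapWorkCreationSketch {
  private int sequenceNumber = 0;

  /** Creates a MapWork named after a running counter and registers the scan under its alias. */
  public MapWork newMapWorkFor(TableScanOperator root) {
    MapWork mapWork = new MapWork("Map " + (++sequenceNumber)); // display name, e.g. "Map 1"
    String alias = root.getConf().getAlias();                   // alias the table scan was created for
    mapWork.getAliasToWork().put(alias, root);                  // operator tree rooted at the scan
    return mapWork;
  }
}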


    for (Task<?> task : pctx.getRootTasks()) {
      if (!(task instanceof MapRedTask) || !((MapRedTask)task).getWork().isFinalMapRed()) {
        continue; // this could be replaced by bucketing on RS + bucketed fetcher for next MR
      }
      MapredWork mrWork = ((MapRedTask) task).getWork();
      MapWork mapWork = mrWork.getMapWork();
      ReduceWork reduceWork = mrWork.getReduceWork();

      if (reduceWork == null || reduceWork.getNumReduceTasks() != 1
          || mapWork.getAliasToWork().size() != 1 || mapWork.getSamplingType() > 0
          || reduceWork.getReducer() == null) {
        continue;
      }
      Operator<?> operator = mapWork.getAliasToWork().values().iterator().next();
      if (!(operator instanceof TableScanOperator)) {
        continue;
      }
      ReduceSinkOperator child =
          OperatorUtils.findSingleOperator(operator, ReduceSinkOperator.class);
      if (child == null ||
          child.getConf().getNumReducers() != 1 || !child.getConf().getPartitionCols().isEmpty()) {
        continue;
      }
      child.getConf().setNumReducers(-1);
      reduceWork.setNumReduceTasks(-1);
      mapWork.setSamplingType(MapWork.SAMPLING_ON_START);
    }
    return pctx;
  }
View Full Code Here
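All of the conditions the loop above tests are plain accessors on MapWork and ReduceWork. A reduced sketch of just the eligibility test, assuming the MapredWork comes from a final map-reduce task (the helper class name is hypothetical):

import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.ReduceWork;

public class SamplingEligibilitySketch {
  /** True when the plan has a single map-side alias, a single reducer, and no sampling requested yet. */
  public static boolean canSampleOnStart(MapredWork mrWork) {
    MapWork mapWork = mrWork.getMapWork();
    ReduceWork reduceWork = mrWork.getReduceWork();
    return reduceWork != null
        && reduceWork.getNumReduceTasks() == 1     // exactly one reducer
        && reduceWork.getReducer() != null
        && mapWork.getAliasToWork().size() == 1    // single input alias on the map side
        && mapWork.getSamplingType() <= 0;         // sampling not already enabled (snippet checks > 0)
  }
}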

        String.format("Warning: %s", msg));
  }

  private void checkMapJoins(MapRedTask mrTsk) throws SemanticException {
    MapredWork mrWrk = mrTsk.getWork();
    MapWork mapWork = mrWrk.getMapWork();
    List<String> warnings = new MapJoinCheck(mrTsk.toString()).analyze(mapWork);
    if (!warnings.isEmpty()) {
      for (String w : warnings) {
        warn(w);
      }
View Full Code Here

      // Nothing to do if it is not a MapReduce task.
      return;
    }

    MapRedTask childMapRedTask = (MapRedTask) childTask;
    MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
    MapWork childMapWork = childMapRedTask.getWork().getMapWork();

    Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork =
        mapJoinMapWork.getAliasToWork();
    if (mapJoinAliasToWork.size() > 1) {
      // Do not merge if the MapredWork of MapJoin has multiple input aliases.
      return;
    }

    Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry =
        mapJoinAliasToWork.entrySet().iterator().next();
    String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
    TableScanOperator mapJoinTaskTableScanOperator =
        OperatorUtils.findSingleOperator(
            mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
    if (mapJoinTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + mapJoinAlias +
          ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
    }
    FileSinkOperator mapJoinTaskFileSinkOperator =
        OperatorUtils.findSingleOperator(
            mapJoinTaskTableScanOperator, FileSinkOperator.class);
    if (mapJoinTaskFileSinkOperator == null) {
      throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() +
          " operator at the last operator of the MapJoin Task.");
    }

    // The mapJoinTaskFileSinkOperator writes to a different directory
    String childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName().toString();
    List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
    if (childMRAliases == null || childMRAliases.size() != 1) {
      return;
    }
    String childMRAlias = childMRAliases.get(0);

    // Sanity check to make sure there is no alias conflict after merge.
    for (Entry<String, ArrayList<String>> entry : childMapWork.getPathToAliases().entrySet()) {
      String path = entry.getKey();
      List<String> aliases = entry.getValue();

      if (path.equals(childMRPath)) {
        continue;
      }

      if (aliases.contains(mapJoinAlias)) {
        // an alias conflict should not happen here.
        return;
      }
    }

    MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapLocalWork();
    MapredLocalWork childLocalWork = childMapWork.getMapLocalWork();

    if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) ||
        (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
      // Right now, we do not handle the case where either of them is bucketed.
      // We should relax this constraint in a follow-up JIRA.
      return;
    }

    // We need to check if the total size of local tables is under the limit.
    // Here we use a strong condition: the total size of the local tables used by
    // all input paths. This could be relaxed to check the total size of local
    // tables for each input path separately.
    // Example:
    //               UNION_ALL
    //              /         \
    //             /           \
    //            /             \
    //           /               \
    //       MapJoin1          MapJoin2
    //      /   |   \         /   |   \
    //     /    |    \       /    |    \
    //   Big1   S1   S2    Big2   S3   S4
    // In this case, we have two MapJoins, MapJoin1 and MapJoin2. Big1 and Big2 are two
    // big tables, and S1, S2, S3, and S4 are four small tables. Hash tables of S1 and S2
    // will only be used by Map tasks processing Big1. Hash tables of S3 and S4 will only
    // be used by Map tasks processing Big2. If Big1!=Big2, we should only check if the size
    // of S1 + S2 is under the limit, and if the size of S3 + S4 is under the limit.
    // But, right now, we are checking the size of S1 + S2 + S3 + S4 is under the limit.
    // If Big1=Big2, we will only scan a path once. So, MapJoin1 and MapJoin2 will be executed
    // in the same Map task. In this case, we need to make sure the size of S1 + S2 + S3 + S4
    // is under the limit.
    if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)){
      // The total size of local tables may not be under
      // the limit after we merge mapJoinLocalWork and childLocalWork.
      // Do not merge.
      return;
    }

    TableScanOperator childMRTaskTableScanOperator =
        OperatorUtils.findSingleOperator(
            childMapWork.getAliasToWork().get(childMRAlias), TableScanOperator.class);
    if (childMRTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + childMRAlias +
          ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
    }

    List<Operator<? extends OperatorDesc>> parentsInMapJoinTask =
        mapJoinTaskFileSinkOperator.getParentOperators();
    List<Operator<? extends OperatorDesc>> childrenInChildMRTask =
        childMRTaskTableScanOperator.getChildOperators();
    if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
      // Do not merge if we do not know how to connect two operator trees.
      return;
    }

    // Step 2: Merge mapJoinTask into the Map-side of its child.
    // Step 2.1: Connect the operator trees of two MapRedTasks.
    Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
    Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
    parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
    childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);

    // Step 2.2: Replace the corresponding part of the child task's MapWork.
    GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias, mapJoinMapWork, childMapWork);

    // Step 2.3: Merge the MapJoin task's local work into the child task's local work.
    if (mapJoinLocalWork != null) {
      if (childLocalWork == null) {
        childMapWork.setMapLocalWork(mapJoinLocalWork);
      } else {
        childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
        childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
      }
    }
View Full Code Here
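Of the three merge steps, step 2.3 is the most self-contained: it only moves two alias maps from the MapJoin task's local work into the child's. A minimal sketch of that step in isolation (the helper class name is hypothetical; the accessors are the ones used above):

import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;

public class LocalWorkMergeSketch {
  /** Moves the MapJoin task's small-table local work into the child task's MapWork. */
  public static void mergeLocalWork(MapWork childMapWork,
                                    MapredLocalWork mapJoinLocalWork,
                                    MapredLocalWork childLocalWork) {
    if (mapJoinLocalWork == null) {
      return;                                         // nothing to move
    }
    if (childLocalWork == null) {
      childMapWork.setMapLocalWork(mapJoinLocalWork); // child had no local work; adopt it wholesale
    } else {
      // Merge the alias -> FetchWork and alias -> operator-tree maps for the small tables.
      childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
      childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
    }
  }
}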

    if (joinOp == null || joinOp.getConf().isFixedAsSorted()) {
      return null;
    }
    currTask.setTaskTag(Task.COMMON_JOIN);

    MapWork currWork = currTask.getWork().getMapWork();

    // create conditional work list and task list
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();

    // create task to aliases mapping and alias to input file mapping for resolver
    HashMap<Task<? extends Serializable>, Set<String>> taskToAliases =
        new HashMap<Task<? extends Serializable>, Set<String>>();
    HashMap<String, ArrayList<String>> pathToAliases = currWork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = currWork.getAliasToWork();

    // get parseCtx for this Join Operator
    ParseContext parseCtx = physicalContext.getParseContext();
    QBJoinTree joinTree = parseCtx.getJoinContext().get(joinOp);

    // start to generate multiple map join tasks
    JoinDesc joinDesc = joinOp.getConf();

    if (aliasToSize == null) {
      aliasToSize = new HashMap<String, Long>();
    }

    try {
      long aliasTotalKnownInputSize =
          getTotalKnownInputSize(context, currWork, pathToAliases, aliasToSize);

      Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc
          .getConds());

      // no table could be the big table; there is no need to convert
      if (bigTableCandidates.isEmpty()) {
        return null;
      }

      // If any of the bigTableCandidates is multi-sourced, bigTableCandidates should
      // contain only the multi-sourced ones, because a multi-sourced table can neither
      // be hashed nor read directly.
      bigTableCandidates = multiInsertBigTableCheck(joinOp, bigTableCandidates);

      Configuration conf = context.getConf();

      // If the sizes of at least n-1 tables in an n-way join are known and their sum is smaller
      // than the threshold size, convert the join into a map-join and don't create a conditional task
      boolean convertJoinMapJoin = HiveConf.getBoolVar(conf,
          HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK);
      int bigTablePosition = -1;
      if (convertJoinMapJoin) {
        // This is the threshold the user has specified for the small tables to fit in a mapjoin
        long mapJoinSize = HiveConf.getLongVar(conf,
            HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);

        Long bigTableSize = null;
        Set<String> aliases = aliasToWork.keySet();
        for (int tablePosition : bigTableCandidates) {
          Operator<?> parent = joinOp.getParentOperators().get(tablePosition);
          Set<String> participants = GenMapRedUtils.findAliases(currWork, parent);
          long sumOfOthers = Utilities.sumOfExcept(aliasToSize, aliases, participants);
          if (sumOfOthers < 0 || sumOfOthers > mapJoinSize) {
            continue; // some small alias is not known or too big
          }
          if (bigTableSize == null && bigTablePosition >= 0 && tablePosition < bigTablePosition) {
            continue; // prefer right most alias
          }
          long aliasSize = Utilities.sumOf(aliasToSize, participants);
          if (bigTableSize == null || bigTableSize < 0 || (aliasSize >= 0 && aliasSize >= bigTableSize)) {
            bigTablePosition = tablePosition;
            bigTableSize = aliasSize;
          }
        }
      }

      currWork.setOpParseCtxMap(parseCtx.getOpParseCtx());
      currWork.setJoinTree(joinTree);

      if (bigTablePosition >= 0) {
        // create map join task and set big table as bigTablePosition
        MapRedTask newTask = convertTaskToMapJoinTask(currTask.getWork(), bigTablePosition);

        newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
        replaceTask(currTask, newTask, physicalContext);

        // Can this task be merged with the child task? This can happen if a big table is being
        // joined with multiple small tables on different keys
        if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)) {
          mergeMapJoinTaskIntoItsChildMapRedTask(newTask, conf);
        }

        return newTask;
      }

      long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf,
          HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
      for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
        // this table cannot be the big table
        if (!bigTableCandidates.contains(pos)) {
          continue;
        }
        // deep copy a new mapred work from xml
        // Once HIVE-4396 is in, it would be faster to use a cheaper method to clone the plan
        MapredWork newWork = Utilities.clonePlan(currTask.getWork());

        // create a map join task with the table at position pos as the big table
        MapRedTask newTask = convertTaskToMapJoinTask(newWork, pos);

        Operator<?> startOp = joinOp.getParentOperators().get(pos);
        Set<String> aliases = GenMapRedUtils.findAliases(currWork, startOp);

        long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
        if (cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
          continue;
        }

        // add into conditional task
        listWorks.add(newTask.getWork());
        listTasks.add(newTask);
        newTask.setTaskTag(Task.CONVERTED_MAPJOIN);

        // set up backup task
        newTask.setBackupTask(currTask);
        newTask.setBackupChildrenTasks(currTask.getChildTasks());

        // put the mapping task to aliases
        taskToAliases.put(newTask, aliases);
      }
    } catch (Exception e) {
      e.printStackTrace();
      throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
    }

    // insert the current common join task into the conditional task
    listWorks.add(currTask.getWork());
    listTasks.add(currTask);
    // clear JoinTree and OP Parse Context
    currWork.setOpParseCtxMap(null);
    currWork.setJoinTree(null);

    // create conditional task and insert conditional task into task tree
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
    cndTsk.setListTasks(listTasks);
View Full Code Here
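The last lines above wire every candidate plan into one conditional task. A reduced sketch of just that wiring, assuming the listWorks, listTasks, and parseCtx built earlier in the method (the wrapper class name is hypothetical):

import java.io.Serializable;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.ConditionalTask;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.plan.ConditionalWork;

public class ConditionalTaskSketch {
  /** Wraps the candidate map-join plans plus the backup common-join plan into one ConditionalTask. */
  public static ConditionalTask buildConditionalTask(List<Serializable> listWorks,
      List<Task<? extends Serializable>> listTasks, ParseContext parseCtx) {
    ConditionalWork cndWork = new ConditionalWork(listWorks);   // one entry per candidate plan
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
    cndTsk.setListTasks(listTasks);                             // the resolver picks one at run time
    return cndTsk;
  }
}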

    return true;
  }

  private JoinOperator getJoinOp(MapRedTask task) throws SemanticException {
    MapWork mWork = task.getWork().getMapWork();
    ReduceWork rWork = task.getWork().getReduceWork();
    if (rWork == null) {
      return null;
    }
    Operator<? extends OperatorDesc> reducerOp = rWork.getReducer();
    if (reducerOp instanceof JoinOperator) {
      /* Check whether any operator is present that prevents the conversion */
      Map<String, Operator<? extends OperatorDesc>> aliasToWork = mWork.getAliasToWork();
      for (Operator<? extends OperatorDesc> op : aliasToWork.values()) {
        if (!checkOperatorOKMapJoinConversion(op)) {
          return null;
        }
      }
View Full Code Here

      // modify the parse context to use indexing
      // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
      HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);

      // prepare the map reduce job to use indexing
      MapWork work = currentTask.getWork().getMapWork();
      work.setInputformat(queryContext.getIndexInputFormat());
      work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
      // modify inputs based on index query
      Set<ReadEntity> inputs = pctx.getSemanticInputs();
      inputs.addAll(queryContext.getAdditionalSemanticInputs());
      List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
View Full Code Here
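Only two MapWork mutators are involved in pointing the job at the index: the input format and the intermediate file produced by the index query. A minimal sketch, assuming both values come from the chosen index's HiveIndexQueryContext as above (the helper class name is hypothetical):

import org.apache.hadoop.hive.ql.plan.MapWork;

public class IndexInputSketch {
  /** Points the map-side work at an index-aware input format and its intermediate file. */
  public static void useIndexInput(MapWork work, String indexInputFormat, String intermediateFile) {
    work.setInputformat(indexInputFormat);           // class name of the index input format
    work.addIndexIntermediateFile(intermediateFile); // file written by the index query tasks
  }
}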

    joinDescriptor.setSkewKeysValuesTables(tableDescList);
    joinDescriptor.setKeyTableDesc(keyTblDesc);

    for (int i = 0; i < numAliases - 1; i++) {
      Byte src = tags[i];
      MapWork newPlan = PlanUtils.getMapRedWork().getMapWork();

      // This code has been added only for testing
      boolean mapperCannotSpanPartns =
        parseCtx.getConf().getBoolVar(
          HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
      newPlan.setMapperCannotSpanPartns(mapperCannotSpanPartns);

      MapredWork clonePlan = Utilities.clonePlan(currPlan);

      Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
      for (int k = 0; k < tags.length; k++) {
        Operator<? extends OperatorDesc> ts =
            GenMapRedUtils.createTemporaryTableScanOperator(rowSchemaList.get((byte)k));
        ((TableScanOperator)ts).setTableDesc(tableDescList.get((byte)k));
        parentOps[k] = ts;
      }
      Operator<? extends OperatorDesc> tblScan_op = parentOps[i];

      ArrayList<String> aliases = new ArrayList<String>();
      String alias = src.toString();
      aliases.add(alias);
      Path bigKeyDirPath = bigKeysDirMap.get(src);
      newPlan.getPathToAliases().put(bigKeyDirPath.toString(), aliases);

      newPlan.getAliasToWork().put(alias, tblScan_op);
      PartitionDesc part = new PartitionDesc(tableDescList.get(src), null);

      newPlan.getPathToPartitionInfo().put(bigKeyDirPath.toString(), part);
      newPlan.getAliasToPartnInfo().put(alias, part);

      Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
      assert reducer instanceof JoinOperator;
      JoinOperator cloneJoinOp = (JoinOperator) reducer;

      String dumpFilePrefix = "mapfile"+PlanUtils.getCountForMapJoinDumpFilePrefix();
      MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc,
          newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc,
          joinDescriptor.getOutputColumnNames(), i, joinDescriptor.getConds(),
          joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix);
      mapJoinDescriptor.setTagOrder(tags);
      mapJoinDescriptor.setHandleSkewJoin(false);
      mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());

      MapredLocalWork localPlan = new MapredLocalWork(
          new LinkedHashMap<String, Operator<? extends OperatorDesc>>(),
          new LinkedHashMap<String, FetchWork>());
      Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);

      for (int j = 0; j < numAliases; j++) {
        if (j == i) {
          continue;
        }
        Byte small_alias = tags[j];
        Operator<? extends OperatorDesc> tblScan_op2 = parentOps[j];
        localPlan.getAliasToWork().put(small_alias.toString(), tblScan_op2);
        Path tblDir = smallTblDirs.get(small_alias);
        localPlan.getAliasToFetchWork().put(small_alias.toString(),
            new FetchWork(tblDir, tableDescList.get(small_alias)));
      }

      newPlan.setMapLocalWork(localPlan);

      // construct a map join and set it as the child operator of tblScan_op
      MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory
          .getAndMakeChild(mapJoinDescriptor, (RowSchema) null, parentOps);
      // change the children of the original join operator to point to the map
      // join operator
      List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp
          .getChildOperators();
      for (Operator<? extends OperatorDesc> childOp : childOps) {
        childOp.replaceParent(cloneJoinOp, mapJoinOp);
      }
      mapJoinOp.setChildOperators(childOps);

      HiveConf jc = new HiveConf(parseCtx.getConf(),
          GenMRSkewJoinProcessor.class);

      newPlan.setNumMapTasks(HiveConf
          .getIntVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
      newPlan
          .setMinSplitSize(HiveConf.getLongVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
      newPlan.setInputformat(HiveInputFormat.class.getName());

      MapredWork w = new MapredWork();
      w.setMapWork(newPlan);

      Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(w, jc);
View Full Code Here
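Most of the plan construction above is the wiring of one directory and one alias into a fresh MapWork, which is then wrapped in a MapredWork for TaskFactory. A compact sketch of that wiring, with the directory, alias, scan operator, and partition descriptor taken as given (the helper class is hypothetical):

import java.util.ArrayList;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;

public class SkewPlanSketch {
  /** Builds a single-alias MapWork that scans one directory, wrapped in a MapredWork. */
  public static MapredWork singleAliasPlan(Path dir, String alias,
                                           Operator<? extends OperatorDesc> tableScan,
                                           PartitionDesc part) {
    MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();   // empty map-only plan, as in the snippet

    ArrayList<String> aliases = new ArrayList<String>();
    aliases.add(alias);
    mapWork.getPathToAliases().put(dir.toString(), aliases);    // which aliases read this directory
    mapWork.getAliasToWork().put(alias, tableScan);              // operator tree for the alias
    mapWork.getPathToPartitionInfo().put(dir.toString(), part);  // how to deserialize rows in the directory
    mapWork.getAliasToPartnInfo().put(alias, part);

    MapredWork w = new MapredWork();
    w.setMapWork(mapWork);
    return w;
  }
}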

        // create map join task for the given big table position
        MapRedTask newTask = convertSMBTaskToMapJoinTask(
            currJoinWork, bigTablePosition, newSMBJoinOp, joinTree);

        MapWork mapWork = newTask.getWork().getMapWork();
        Operator<?> parentOp = originalSMBJoinOp.getParentOperators().get(bigTablePosition);
        Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);

        long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
        if (aliasKnownSize > 0) {
View Full Code Here
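Sizing the big-table branch reduces to the two utility calls shown above: collect the aliases feeding one parent of the join, then sum their known sizes. A hedged sketch, assuming an aliasToSize map populated earlier (a negative sum indicates an unknown alias size, per the checks in the earlier snippet; the class name is hypothetical):

import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.plan.MapWork;

public class AliasSizeSketch {
  /** Sums the known sizes of the aliases feeding one parent of the join in the given MapWork. */
  public static long knownSizeOfBranch(MapWork mapWork, Operator<?> parentOp,
                                       Map<String, Long> aliasToSize) {
    Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);
    return Utilities.sumOf(aliasToSize, aliases);   // negative when some alias size is unknown
  }
}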

   */
  private Schema getSchema(JobConf job, FileSplit split) throws AvroSerdeException, IOException {
    FileSystem fs = split.getPath().getFileSystem(job);
    // Inside an MR job, we can pull out the actual properties
    if(AvroSerdeUtils.insideMRJob(job)) {
      MapWork mapWork = Utilities.getMapWork(job);

      // Iterate over the Path -> Partition descriptions to find the partition
      // that matches our input split.
      for (Map.Entry<String,PartitionDesc> pathsAndParts: mapWork.getPathToPartitionInfo().entrySet()){
        String partitionPath = pathsAndParts.getKey();
        if(pathIsInPartition(split.getPath(), partitionPath)) {
          if(LOG.isInfoEnabled()) {
              LOG.info("Matching partition " + partitionPath +
                      " with input split " + split);
View Full Code Here
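The same lookup works in any task that can see the serialized plan: Utilities.getMapWork(job) deserializes the MapWork from the job configuration, and its path-to-partition map can be scanned for the split's path. A short sketch, with a simple prefix test standing in for the pathIsInPartition() helper used above (the class name is hypothetical):

import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.mapred.JobConf;

public class PartitionLookupSketch {
  /** Finds the PartitionDesc registered for a partition path that is a prefix of the split's path. */
  public static PartitionDesc findPartition(JobConf job, Path splitPath) {
    MapWork mapWork = Utilities.getMapWork(job);    // plan deserialized from the job configuration
    for (Map.Entry<String, PartitionDesc> e : mapWork.getPathToPartitionInfo().entrySet()) {
      // Simple prefix test standing in for the pathIsInPartition() helper used in the snippet.
      if (splitPath.toString().startsWith(e.getKey())) {
        return e.getValue();
      }
    }
    return null;                                    // no matching partition registered
  }
}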
