Examples of org.apache.hadoop.hive.ql.plan.MapWork

org.apache.hadoop.hive.ql.plan.MapWork
MapWork represents all the information used to run a map task on the cluster. It is first used when the query planner breaks the logical plan into tasks and used throughout physical optimization to track map-side operator plans, input paths, aliases, etc. ExecDriver will serialize the contents of this class and make sure it is distributed on the cluster. The ExecMapper will ultimately deserialize this class on the data nodes and setup it's operator pipeline accordingly. This class is also used in the explain command any property with the appropriate annotation will be displayed in the explain output.

  }


  // loop over all the tasks recursively
  private void setInputFormat(Task<? extends Serializable> task) {
    if (task instanceof ExecDriver) {
      MapWork work = ((MapredWork) task.getWork()).getMapWork();
      HashMap<String, Operator<? extends OperatorDesc>> opMap = work.getAliasToWork();
      if (!opMap.isEmpty()) {
        for (Operator<? extends OperatorDesc> op : opMap.values()) {
          setInputFormat(work, op);
        }
      }

View Full Code Here

      Iterator<TableScanOperator> iterator
        = walkerCtx.getMetadataOnlyTableScans().iterator();


      while (iterator.hasNext()) {
        TableScanOperator tso = iterator.next();
        MapWork work = ((MapredWork) task.getWork()).getMapWork();
        String alias = getAliasForTableScanOperator(work, tso);
        LOG.info("Metadata only table scan for " + alias);
        processAlias(work, alias);
      }

View Full Code Here

    //
    // 2. Constructing a conditional task consisting of a move task and a map reduce task
    //
    MoveWork dummyMv = new MoveWork(null, null, null,
        new LoadFileDesc(fsInputDesc.getFinalDirName(), finalName, true, null, null), false);
    MapWork cplan;
    Serializable work;


    if (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL) &&
        fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) {


      // Check if InputFormatClass is valid
      String inputFormatClass = conf.getVar(ConfVars.HIVEMERGEINPUTFORMATBLOCKLEVEL);
      try {
        Class c = (Class<? extends InputFormat>) Class.forName(inputFormatClass);


        LOG.info("RCFile format- Using block level merge");
        cplan = createRCFileMergeTask(fsInputDesc, finalName,
            dpCtx != null && dpCtx.getNumDPCols() > 0);
        work = cplan;
      } catch (ClassNotFoundException e) {
        String msg = "Illegal input format class: " + inputFormatClass;
        throw new SemanticException(msg);
      }


    } else {
      cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
      work = new MapredWork();
      ((MapredWork)work).setMapWork(cplan);
      // use CombineHiveInputFormat for map-only merging
    }
    cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
    // NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't
    // know if merge MR2 will be triggered at execution time
    ConditionalTask cndTsk = createCondTask(conf, ctx.getCurrTask(), dummyMv, work,
        fsInputDesc.getFinalDirName());

View Full Code Here

    TableDesc tblDesc = fsDesc.getTableInfo();
    aliases.add(inputDir); // dummy alias: just use the input path


    // constructing the default MapredWork
    MapredWork cMrPlan = GenMapRedUtils.getMapRedWorkFromConf(conf);
    MapWork cplan = cMrPlan.getMapWork();
    cplan.getPathToAliases().put(inputDir, aliases);
    cplan.getPathToPartitionInfo().put(inputDir, new PartitionDesc(tblDesc, null));
    cplan.getAliasToWork().put(inputDir, topOp);
    cplan.setMapperCannotSpanPartns(true);


    return cplan;
  }

View Full Code Here


        MapredWork plan = (MapredWork) currTask.getWork();
        for (int pos = 0; pos < size; pos++) {
          String taskTmpDir = taskTmpDirLst.get(pos);
          TableDesc tt_desc = tt_descLst.get(pos);
          MapWork mWork = plan.getMapWork();
          if (mWork.getPathToAliases().get(taskTmpDir) == null) {
            mWork.getPathToAliases().put(taskTmpDir,
                new ArrayList<String>());
            mWork.getPathToAliases().get(taskTmpDir).add(taskTmpDir);
            mWork.getPathToPartitionInfo().put(taskTmpDir,
                new PartitionDesc(tt_desc, null));
            mWork.getAliasToWork().put(taskTmpDir, topOperators.get(pos));
          }
        }
      }
    }
  }

View Full Code Here

   *          pruned partition list. If it is null it will be computed on-the-fly.
   */
  public static void setTaskPlan(String alias_id,
      Operator<? extends OperatorDesc> topOp, Task<?> task, boolean local,
      GenMRProcContext opProcCtx, PrunedPartitionList pList) throws SemanticException {
    MapWork plan = ((MapredWork) task.getWork()).getMapWork();
    ParseContext parseCtx = opProcCtx.getParseCtx();
    Set<ReadEntity> inputs = opProcCtx.getInputs();


    ArrayList<Path> partDir = new ArrayList<Path>();
    ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();


    Path tblDir = null;
    TableDesc tblDesc = null;


    PrunedPartitionList partsList = pList;


    plan.setNameToSplitSample(parseCtx.getNameToSplitSample());


    if (partsList == null) {
      try {
        TableScanOperator tsOp = (TableScanOperator) topOp;
        partsList = PartitionPruner.prune(tsOp, parseCtx, alias_id);
      } catch (SemanticException e) {
        throw e;
      } catch (HiveException e) {
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
      }
    }


    // Generate the map work for this alias_id
    // pass both confirmed and unknown partitions through the map-reduce
    // framework
    Set<Partition> parts = partsList.getPartitions();
    PartitionDesc aliasPartnDesc = null;
    try {
      if (!parts.isEmpty()) {
        aliasPartnDesc = Utilities.getPartitionDesc(parts.iterator().next());
      }
    } catch (HiveException e) {
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
      throw new SemanticException(e.getMessage(), e);
    }


    // The table does not have any partitions
    if (aliasPartnDesc == null) {
      aliasPartnDesc = new PartitionDesc(Utilities.getTableDesc(parseCtx
          .getTopToTable().get(topOp)), null);


    }


    Map<String, String> props = parseCtx.getTopToProps().get(topOp);
    if (props != null) {
      Properties target = aliasPartnDesc.getProperties();
      if (target == null) {
        aliasPartnDesc.setProperties(target = new Properties());
      }
      target.putAll(props);
    }


    plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc);


    long sizeNeeded = Integer.MAX_VALUE;
    int fileLimit = -1;
    if (parseCtx.getGlobalLimitCtx().isEnable()) {
      long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(),
          HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
      sizeNeeded = parseCtx.getGlobalLimitCtx().getGlobalLimit() * sizePerRow;
      // for the optimization that reduce number of input file, we limit number
      // of files allowed. If more than specific number of files have to be
      // selected, we skip this optimization. Since having too many files as
      // inputs can cause unpredictable latency. It's not necessarily to be
      // cheaper.
      fileLimit =
          HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE);


      if (sizePerRow <= 0 || fileLimit <= 0) {
        LOG.info("Skip optimization to reduce input size of 'limit'");
        parseCtx.getGlobalLimitCtx().disableOpt();
      } else if (parts.isEmpty()) {
        LOG.info("Empty input: skip limit optimiztion");
      } else {
        LOG.info("Try to reduce input size for 'limit' " +
            "sizeNeeded: " + sizeNeeded +
            "  file limit : " + fileLimit);
      }
    }
    boolean isFirstPart = true;
    boolean emptyInput = true;
    boolean singlePartition = (parts.size() == 1);


    // Track the dependencies for the view. Consider a query like: select * from V;
    // where V is a view of the form: select * from T
    // The dependencies should include V at depth 0, and T at depth 1 (inferred).
    ReadEntity parentViewInfo = getParentViewInfo(alias_id, parseCtx.getViewAliasToInput());


    // The table should also be considered a part of inputs, even if the table is a
    // partitioned table and whether any partition is selected or not
    PlanUtils.addInput(inputs,
        new ReadEntity(parseCtx.getTopToTable().get(topOp), parentViewInfo));


    for (Partition part : parts) {
      if (part.getTable().isPartitioned()) {
        PlanUtils.addInput(inputs, new ReadEntity(part, parentViewInfo));
      } else {
        PlanUtils.addInput(inputs, new ReadEntity(part.getTable(), parentViewInfo));
      }


      // Later the properties have to come from the partition as opposed
      // to from the table in order to support versioning.
      Path[] paths = null;
      sampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(topOp);


      // Lookup list bucketing pruner
      Map<String, ExprNodeDesc> partToPruner = parseCtx.getOpToPartToSkewedPruner().get(topOp);
      ExprNodeDesc listBucketingPruner = (partToPruner != null) ? partToPruner.get(part.getName())
          : null;


      if (sampleDescr != null) {
        assert (listBucketingPruner == null) : "Sampling and list bucketing can't coexit.";
        paths = SamplePruner.prune(part, sampleDescr);
        parseCtx.getGlobalLimitCtx().disableOpt();
      } else if (listBucketingPruner != null) {
        assert (sampleDescr == null) : "Sampling and list bucketing can't coexist.";
        /* Use list bucketing prunner's path. */
        paths = ListBucketingPruner.prune(parseCtx, part, listBucketingPruner);
      } else {
        // Now we only try the first partition, if the first partition doesn't
        // contain enough size, we change to normal mode.
        if (parseCtx.getGlobalLimitCtx().isEnable()) {
          if (isFirstPart) {
            long sizeLeft = sizeNeeded;
            ArrayList<Path> retPathList = new ArrayList<Path>();
            SamplePruner.LimitPruneRetStatus status = SamplePruner.limitPrune(part, sizeLeft,
                fileLimit, retPathList);
            if (status.equals(SamplePruner.LimitPruneRetStatus.NoFile)) {
              continue;
            } else if (status.equals(SamplePruner.LimitPruneRetStatus.NotQualify)) {
              LOG.info("Use full input -- first " + fileLimit + " files are more than "
                  + sizeNeeded
                  + " bytes");


              parseCtx.getGlobalLimitCtx().disableOpt();


            } else {
              emptyInput = false;
              paths = new Path[retPathList.size()];
              int index = 0;
              for (Path path : retPathList) {
                paths[index++] = path;
              }
              if (status.equals(SamplePruner.LimitPruneRetStatus.NeedAllFiles) && singlePartition) {
                // if all files are needed to meet the size limit, we disable
                // optimization. It usually happens for empty table/partition or
                // table/partition with only one file. By disabling this
                // optimization, we can avoid retrying the query if there is
                // not sufficient rows.
                parseCtx.getGlobalLimitCtx().disableOpt();
              }
            }
            isFirstPart = false;
          } else {
            paths = new Path[0];
          }
        }
        if (!parseCtx.getGlobalLimitCtx().isEnable()) {
          paths = part.getPath();
        }
      }


      // is it a partitioned table ?
      if (!part.getTable().isPartitioned()) {
        assert ((tblDir == null) && (tblDesc == null));


        tblDir = paths[0];
        tblDesc = Utilities.getTableDesc(part.getTable());
      } else if (tblDesc == null) {
        tblDesc = Utilities.getTableDesc(part.getTable());
      }


      if (props != null) {
        Properties target = tblDesc.getProperties();
        if (target == null) {
          tblDesc.setProperties(target = new Properties());
        }
        target.putAll(props);
      }


      for (Path p : paths) {
        if (p == null) {
          continue;
        }
        String path = p.toString();
        if (LOG.isDebugEnabled()) {
          LOG.debug("Adding " + path + " of table" + alias_id);
        }


        partDir.add(p);
        try {
          if (part.getTable().isPartitioned()) {
            partDesc.add(Utilities.getPartitionDesc(part));
          }
          else {
            partDesc.add(Utilities.getPartitionDescFromTableDesc(tblDesc, part));
          }
        } catch (HiveException e) {
          LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
          throw new SemanticException(e.getMessage(), e);
        }
      }
    }
    if (emptyInput) {
      parseCtx.getGlobalLimitCtx().disableOpt();
    }


    Iterator<Path> iterPath = partDir.iterator();
    Iterator<PartitionDesc> iterPartnDesc = partDesc.iterator();


    if (!local) {
      while (iterPath.hasNext()) {
        assert iterPartnDesc.hasNext();
        String path = iterPath.next().toString();


        PartitionDesc prtDesc = iterPartnDesc.next();


        // Add the path to alias mapping
        if (plan.getPathToAliases().get(path) == null) {
          plan.getPathToAliases().put(path, new ArrayList<String>());
        }
        plan.getPathToAliases().get(path).add(alias_id);
        plan.getPathToPartitionInfo().put(path, prtDesc);
        if (LOG.isDebugEnabled()) {
          LOG.debug("Information added for path " + path);
        }
      }


      assert plan.getAliasToWork().get(alias_id) == null;
      plan.getAliasToWork().put(alias_id, topOp);
    } else {
      // populate local work if needed
      MapredLocalWork localPlan = plan.getMapLocalWork();
      if (localPlan == null) {
        localPlan = new MapredLocalWork(
            new LinkedHashMap<String, Operator<? extends OperatorDesc>>(),
            new LinkedHashMap<String, FetchWork>());
      }


      assert localPlan.getAliasToWork().get(alias_id) == null;
      assert localPlan.getAliasToFetchWork().get(alias_id) == null;
      localPlan.getAliasToWork().put(alias_id, topOp);
      if (tblDir == null) {
        tblDesc = Utilities.getTableDesc(partsList.getSourceTable());
        localPlan.getAliasToFetchWork().put(
            alias_id,
            new FetchWork(FetchWork.convertPathToStringArray(partDir), partDesc, tblDesc));
      } else {
        localPlan.getAliasToFetchWork().put(alias_id,
            new FetchWork(tblDir.toString(), tblDesc));
      }
      plan.setMapLocalWork(localPlan);
    }
    opProcCtx.addSeenOp(task, topOp);
  }

View Full Code Here

   *
   * @return the new plan
   */
  public static MapredWork getMapRedWorkFromConf(HiveConf conf) {
    MapredWork mrWork = new MapredWork();
    MapWork work = mrWork.getMapWork();


    boolean mapperCannotSpanPartns =
        conf.getBoolVar(
            HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
    work.setMapperCannotSpanPartns(mapperCannotSpanPartns);
    work.setPathToAliases(new LinkedHashMap<String, ArrayList<String>>());
    work.setPathToPartitionInfo(new LinkedHashMap<String, PartitionDesc>());
    work.setAliasToWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>());
    work.setHadoopSupportsSplittable(
        conf.getBoolVar(HiveConf.ConfVars.HIVE_COMBINE_INPUT_FORMAT_SUPPORTS_SPLITTABLE));
    return mrWork;
  }

View Full Code Here

   */
  private Schema getSchema(JobConf job, FileSplit split) throws AvroSerdeException, IOException {
    FileSystem fs = split.getPath().getFileSystem(job);
    // Inside of a MR job, we can pull out the actual properties
    if(AvroSerdeUtils.insideMRJob(job)) {
      MapWork mapWork = Utilities.getMapWork(job);


      // Iterate over the Path -> Partition descriptions to find the partition
      // that matches our input split.
      for (Map.Entry<String,PartitionDesc> pathsAndParts: mapWork.getPathToPartitionInfo().entrySet()){
        String partitionPath = pathsAndParts.getKey();
        if(pathIsInPartition(split.getPath(), partitionPath)) {
          if(LOG.isInfoEnabled()) {
              LOG.info("Matching partition " + partitionPath +
                      " with input split " + split);

View Full Code Here


            // If it's a standard map reduce task, check what, if anything, it inferred about
            // the directory this move task is moving
            if (task instanceof MapRedTask) {
              MapredWork work = (MapredWork)task.getWork();
              MapWork mapWork = work.getMapWork();
              bucketCols = mapWork.getBucketedColsByDirectory().get(path);
              sortCols = mapWork.getSortedColsByDirectory().get(path);
              if (work.getReduceWork() != null) {
                numBuckets = work.getReduceWork().getNumReduceTasks();
              }


              if (bucketCols != null || sortCols != null) {

View Full Code Here

    }
    try {
      jc = job;
      execContext.setJc(jc);
      // create map and fetch operators
      MapWork mrwork = Utilities.getMapWork(job);
      mo = new MapOperator();
      mo.setConf(mrwork);
      // initialize map operator
      mo.setChildren(job);
      l4j.info(mo.dump(0));
      // initialize map local work
      localWork = mrwork.getMapLocalWork();
      execContext.setLocalWork(localWork);


      MapredContext.init(true, new JobConf(jc));


      mo.setExecContext(execContext);

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.hadoop.hive.ql.plan.MapWork

com.linkedin.haivvreo.AvroGenericRecordReader

com.linkedin.haivvreo.AvroSerDe

org.apache.hadoop.hive.ql.DriverContext

org.apache.hadoop.hive.ql.exec.MoveTask

org.apache.hadoop.hive.ql.exec.mr.ExecDriver

org.apache.hadoop.hive.ql.exec.mr.ExecMapper

org.apache.hadoop.hive.ql.exec.tez.DagUtils

org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator

org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor

org.apache.hadoop.hive.ql.exec.tez.MergeFileRecordProcessor

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.