Examples of org.apache.hadoop.hive.ql.plan.ReduceWork

org.apache.hadoop.hive.ql.plan.ReduceWork
ReduceWork represents all the information used to run a reduce task on the cluster. It is first used when the query planner breaks the logical plan into tasks and used throughout physical optimization to track reduce-side operator plans, schema info about key/value pairs, etc ExecDriver will serialize the contents of this class and make sure it is distributed on the cluster. The ExecReducer will ultimately deserialize this class on the data nodes and setup it's operator pipeline accordingly. This class is also used in the explain command any property with the appropriate annotation will be displayed in the explain output.

    Context ctx = driverContext.getCtx();
    boolean ctxCreated = false;
    Path emptyScratchDir;


    MapWork mWork = work.getMapWork();
    ReduceWork rWork = work.getReduceWork();


    try {
      if (ctx == null) {
        ctx = new Context(job);
        ctxCreated = true;
      }


      emptyScratchDir = ctx.getMRTmpPath();
      FileSystem fs = emptyScratchDir.getFileSystem(job);
      fs.mkdirs(emptyScratchDir);
    } catch (IOException e) {
      e.printStackTrace();
      console.printError("Error launching map-reduce job", "\n"
          + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return 5;
    }


    ShimLoader.getHadoopShims().prepareJobOutput(job);
    //See the javadoc on HiveOutputFormatImpl and HadoopShims.prepareJobOutput()
    job.setOutputFormat(HiveOutputFormatImpl.class);


    job.setMapperClass(ExecMapper.class);


    job.setMapOutputKeyClass(HiveKey.class);
    job.setMapOutputValueClass(BytesWritable.class);


    try {
      job.setPartitionerClass((Class<? extends Partitioner>) (Class.forName(HiveConf.getVar(job,
          HiveConf.ConfVars.HIVEPARTITIONER))));
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e.getMessage());
    }


    if (mWork.getNumMapTasks() != null) {
      job.setNumMapTasks(mWork.getNumMapTasks().intValue());
    }


    if (mWork.getMaxSplitSize() != null) {
      HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mWork.getMaxSplitSize().longValue());
    }


    if (mWork.getMinSplitSize() != null) {
      HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mWork.getMinSplitSize().longValue());
    }


    if (mWork.getMinSplitSizePerNode() != null) {
      HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE, mWork.getMinSplitSizePerNode().longValue());
    }


    if (mWork.getMinSplitSizePerRack() != null) {
      HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK, mWork.getMinSplitSizePerRack().longValue());
    }


    job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
    job.setReducerClass(ExecReducer.class);


    // set input format information if necessary
    setInputAttributes(job);


    // Turn on speculative execution for reducers
    boolean useSpeculativeExecReducers = HiveConf.getBoolVar(job,
        HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HADOOPSPECULATIVEEXECREDUCERS,
        useSpeculativeExecReducers);


    String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);
    if ((inpFormat == null) || (!StringUtils.isNotBlank(inpFormat))) {
      inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName();
    }


    if (mWork.isUseBucketizedHiveInputFormat()) {
      inpFormat = BucketizedHiveInputFormat.class.getName();
    }


    LOG.info("Using " + inpFormat);


    try {
      job.setInputFormat((Class<? extends InputFormat>) (Class.forName(inpFormat)));
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e.getMessage());
    }




    // No-Op - we don't really write anything here ..
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);


    // Transfer HIVEAUXJARS and HIVEADDEDJARS to "tmpjars" so hadoop understands
    // it
    String auxJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEAUXJARS);
    String addedJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDJARS);
    if (StringUtils.isNotBlank(auxJars) || StringUtils.isNotBlank(addedJars)) {
      String allJars = StringUtils.isNotBlank(auxJars) ? (StringUtils.isNotBlank(addedJars) ? addedJars
          + "," + auxJars
          : auxJars)
          : addedJars;
      LOG.info("adding libjars: " + allJars);
      initializeFiles("tmpjars", allJars);
    }


    // Transfer HIVEADDEDFILES to "tmpfiles" so hadoop understands it
    String addedFiles = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDFILES);
    if (StringUtils.isNotBlank(addedFiles)) {
      initializeFiles("tmpfiles", addedFiles);
    }
    int returnVal = 0;
    boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJOBNAME));


    if (noName) {
      // This is for a special case to ensure unit tests pass
      HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, "JOB" + Utilities.randGen.nextInt());
    }
    String addedArchives = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDARCHIVES);
    // Transfer HIVEADDEDARCHIVES to "tmparchives" so hadoop understands it
    if (StringUtils.isNotBlank(addedArchives)) {
      initializeFiles("tmparchives", addedArchives);
    }


    try{
      MapredLocalWork localwork = mWork.getMapRedLocalWork();
      if (localwork != null && localwork.hasStagedAlias()) {
        if (!ShimLoader.getHadoopShims().isLocalMode(job)) {
          Path localPath = localwork.getTmpPath();
          Path hdfsPath = mWork.getTmpHDFSPath();


          FileSystem hdfs = hdfsPath.getFileSystem(job);
          FileSystem localFS = localPath.getFileSystem(job);
          FileStatus[] hashtableFiles = localFS.listStatus(localPath);
          int fileNumber = hashtableFiles.length;
          String[] fileNames = new String[fileNumber];


          for ( int i = 0; i < fileNumber; i++){
            fileNames[i] = hashtableFiles[i].getPath().getName();
          }


          //package and compress all the hashtable files to an archive file
          String stageId = this.getId();
          String archiveFileName = Utilities.generateTarFileName(stageId);
          localwork.setStageID(stageId);


          CompressionUtils.tar(localPath.toUri().getPath(), fileNames,archiveFileName);
          Path archivePath = Utilities.generateTarPath(localPath, stageId);
          LOG.info("Archive "+ hashtableFiles.length+" hash table files to " + archivePath);


          //upload archive file to hdfs
          Path hdfsFilePath =Utilities.generateTarPath(hdfsPath, stageId);
          short replication = (short) job.getInt("mapred.submit.replication", 10);
          hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
          hdfs.setReplication(hdfsFilePath, replication);
          LOG.info("Upload 1 archive file  from" + archivePath + " to: " + hdfsFilePath);


          //add the archive file to distributed cache
          DistributedCache.createSymlink(job);
          DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
          LOG.info("Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
        }
      }
      work.configureJobConf(job);
      List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx, false);
      Utilities.setInputPaths(job, inputPaths);


      Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());


      if (mWork.getSamplingType() > 0 && rWork != null && job.getNumReduceTasks() > 1) {
        try {
          handleSampling(driverContext, mWork, job, conf);
          job.setPartitionerClass(HiveTotalOrderPartitioner.class);
        } catch (IllegalStateException e) {
          console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
          rWork.setNumReduceTasks(1);
          job.setNumReduceTasks(1);
        } catch (Exception e) {
          LOG.error("Sampling error", e);
          console.printError(e.toString(),
              "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
          rWork.setNumReduceTasks(1);
          job.setNumReduceTasks(1);
        }
      }


      // remove the pwd from conf file so that job tracker doesn't show this
      // logs
      String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD);
      if (pwd != null) {
        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE");
      }
      JobClient jc = new JobClient(job);
      // make this client wait if job tracker is not behaving well.
      Throttle.checkJobTracker(job, LOG);


      if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
        // initialize stats publishing table
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(job);
        if (factory != null) {
          statsPublisher = factory.getStatsPublisher();
          if (!statsPublisher.init(job)) { // creating stats table if not exists
            if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
              throw
                new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
            }
          }
        }
      }


      Utilities.createTmpDirs(job, mWork);
      Utilities.createTmpDirs(job, rWork);


      SessionState ss = SessionState.get();
      if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")
          && ss != null) {
        TezSessionState session = ss.getTezSession();
        TezSessionPoolManager.getInstance().close(session, true);
      }


      // Finally SUBMIT the JOB!
      rj = jc.submitJob(job);
      // replace it back
      if (pwd != null) {
        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd);
      }


      returnVal = jobExecHelper.progress(rj, jc, ctx.getHiveTxnManager());
      success = (returnVal == 0);
    } catch (Exception e) {
      e.printStackTrace();
      String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
      if (rj != null) {
        mesg = "Ended Job = " + rj.getJobID() + mesg;
      } else {
        mesg = "Job Submission failed" + mesg;
      }


      // Has to use full name to make sure it does not conflict with
      // org.apache.commons.lang.StringUtils
      console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));


      success = false;
      returnVal = 1;
    } finally {
      Utilities.clearWork(job);
      try {
        if (ctxCreated) {
          ctx.clear();
        }


        if (rj != null) {
          if (returnVal != 0) {
            rj.killJob();
          }
          HadoopJobExecHelper.runningJobs.remove(rj);
          jobID = rj.getID().toString();
        }
      } catch (Exception e) {
      }
    }


    // get the list of Dynamic partition paths
    try {
      if (rj != null) {
        if (mWork.getAliasToWork() != null) {
          for (Operator<? extends OperatorDesc> op : mWork.getAliasToWork().values()) {
            op.jobClose(job, success);
          }
        }
        if (rWork != null) {
          rWork.getReducer().jobClose(job, success);
        }
      }
    } catch (Exception e) {
      // jobClose needs to execute successfully otherwise fail task
      if (success) {

View Full Code Here

        getMapJoinOperator(newTask, newWork, smbJoinOp, joinTree, bigTablePosition);


    // The reducer needs to be restored - Consider a query like:
    // select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
    // The reducer contains a groupby, which needs to be restored.
    ReduceWork rWork = newWork.getReduceWork();


    // create the local work for this plan
    MapJoinProcessor.genLocalWorkForMapJoin(newWork, newMapJoinOp, bigTablePosition);


    // restore the reducer

View Full Code Here

    return true;
  }


  private JoinOperator getJoinOp(MapRedTask task) throws SemanticException {
    MapWork mWork = task.getWork().getMapWork();
    ReduceWork rWork = task.getWork().getReduceWork();
    if (rWork == null) {
      return null;
    }
    Operator<? extends OperatorDesc> reducerOp = rWork.getReducer();
    if (reducerOp instanceof JoinOperator) {
      /* Is any operator present, which prevents the conversion */
      Map<String, Operator<? extends OperatorDesc>> aliasToWork = mWork.getAliasToWork();
      for (Operator<? extends OperatorDesc> op : aliasToWork.values()) {
        if (!checkOperatorOKMapJoinConversion(op)) {

View Full Code Here

    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap =
        opProcCtx.getOpTaskMap();
    Operator<? extends OperatorDesc> currTopOp = opProcCtx.getCurrTopOp();


    opTaskMap.put(reducer, currTask);
    plan.setReduceWork(new ReduceWork());
    plan.getReduceWork().setReducer(reducer);
    ReduceSinkDesc desc = op.getConf();


    plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());

View Full Code Here

    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap =
        opProcCtx.getOpTaskMap();


    opTaskMap.put(reducer, unionTask);


    plan.setReduceWork(new ReduceWork());
    plan.getReduceWork().setReducer(reducer);
    plan.getReduceWork().setReducer(reducer);
    ReduceSinkDesc desc = op.getConf();


    plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());

View Full Code Here

    Task<? extends Serializable> childTask = TaskFactory.get(childPlan, parseCtx
        .getConf());
    Operator<? extends OperatorDesc> reducer = cRS.getChildOperators().get(0);


    // Add the reducer
    ReduceWork rWork = new ReduceWork();
    childPlan.setReduceWork(rWork);
    rWork.setReducer(reducer);
    ReduceSinkDesc desc = cRS.getConf();
    childPlan.getReduceWork().setNumReduceTasks(new Integer(desc.getNumReducers()));


    opProcCtx.getOpTaskMap().put(reducer, childTask);

View Full Code Here

        } else {
          // There are separate configuration parameters to control whether to
          // merge for a map-only job
          // or for a map-reduce job
          if (currTask.getWork() instanceof MapredWork) {
            ReduceWork reduceWork = ((MapredWork) currTask.getWork()).getReduceWork();
            boolean mergeMapOnly =
              hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) && reduceWork == null;
            boolean mergeMapRed =
              hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES) &&
              reduceWork != null;

View Full Code Here

        .getReduceSinkDesc(Utilities.makeList(getStringColumn("key")),
        Utilities.makeList(getStringColumn("value")), outputColumns, true,
        -1, 1, -1, AcidUtils.Operation.NOT_ACID));


    addMapWork(mr, src, "a", op1);
    ReduceWork rWork = new ReduceWork();
    rWork.setNumReduceTasks(Integer.valueOf(1));
    rWork.setKeyDesc(op1.getConf().getKeySerializeInfo());
    rWork.getTagToValueDesc().add(op1.getConf().getValueSerializeInfo());
    mr.setReduceWork(rWork);


    // reduce side work
    Operator<FileSinkDesc> op3 = OperatorFactory.get(new FileSinkDesc(new Path(tmpdir + File.separator
        + "mapredplan1.out"), Utilities.defaultTd, false));


    Operator<ExtractDesc> op2 = OperatorFactory.get(new ExtractDesc(
        getStringColumn(Utilities.ReduceField.VALUE.toString())), op3);


    rWork.setReducer(op2);
  }

View Full Code Here

        Utilities
        .makeList(getStringColumn("key"), getStringColumn("value")),
        outputColumns, false, -1, 1, -1, AcidUtils.Operation.NOT_ACID));


    addMapWork(mr, src, "a", op1);
    ReduceWork rWork = new ReduceWork();
    rWork.setNumReduceTasks(Integer.valueOf(1));
    rWork.setKeyDesc(op1.getConf().getKeySerializeInfo());
    rWork.getTagToValueDesc().add(op1.getConf().getValueSerializeInfo());
    mr.setReduceWork(rWork);


    // reduce side work
    Operator<FileSinkDesc> op4 = OperatorFactory.get(new FileSinkDesc(new Path(tmpdir + File.separator
        + "mapredplan2.out"), Utilities.defaultTd, false));


    Operator<FilterDesc> op3 = OperatorFactory.get(getTestFilterDesc("0"), op4);


    Operator<ExtractDesc> op2 = OperatorFactory.get(new ExtractDesc(
        getStringColumn(Utilities.ReduceField.VALUE.toString())), op3);


    rWork.setReducer(op2);
  }

View Full Code Here

        .getReduceSinkDesc(Utilities.makeList(getStringColumn("key")),
        Utilities.makeList(getStringColumn("key")), outputColumns, true,
        Byte.valueOf((byte) 1), Integer.MAX_VALUE, -1, AcidUtils.Operation.NOT_ACID));


    addMapWork(mr, src2, "b", op2);
    ReduceWork rWork = new ReduceWork();
    rWork.setNumReduceTasks(Integer.valueOf(1));
    rWork.setNeedsTagging(true);
    rWork.setKeyDesc(op1.getConf().getKeySerializeInfo());
    rWork.getTagToValueDesc().add(op1.getConf().getValueSerializeInfo());


    mr.setReduceWork(rWork);
    rWork.getTagToValueDesc().add(op2.getConf().getValueSerializeInfo());


    // reduce side work
    Operator<FileSinkDesc> op4 = OperatorFactory.get(new FileSinkDesc(new Path(tmpdir + File.separator
        + "mapredplan3.out"), Utilities.defaultTd, false));


    Operator<SelectDesc> op5 = OperatorFactory.get(new SelectDesc(Utilities
        .makeList(new ExprNodeFieldDesc(TypeInfoFactory.stringTypeInfo,
        new ExprNodeColumnDesc(TypeInfoFactory.getListTypeInfo(TypeInfoFactory.stringTypeInfo),
        Utilities.ReduceField.VALUE.toString(), "", false), "0", false)),
        Utilities.makeList(outputColumns.get(0))), op4);


    rWork.setReducer(op5);
  }

View Full Code Here

0 1 2 3 4 5 6 7

TOP

Related Classes of org.apache.hadoop.hive.ql.plan.ReduceWork

org.apache.hadoop.hive.ql.DriverContext

org.apache.hadoop.hive.ql.exec.mr.ExecDriver

org.apache.hadoop.hive.ql.exec.mr.ExecReducer

org.apache.hadoop.hive.ql.exec.mr.MapRedTask

org.apache.hadoop.hive.ql.exec.TestExecDriver

org.apache.hadoop.hive.ql.exec.tez.TestTezTask

org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils

org.apache.hadoop.hive.ql.optimizer.physical.CommonJoinTaskDispatcher

org.apache.hadoop.hive.ql.optimizer.physical.CrossProductCheck

org.apache.hadoop.hive.ql.optimizer.physical.SamplingOptimizer

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.