Package org.apache.hadoop.hive.ql.plan

Examples of org.apache.hadoop.hive.ql.plan.MapWork

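MapWork describes the map side of a Hive execution plan. As the excerpts below show, it carries the alias-to-operator-tree map (getAliasToWork), the input format and its sorted flag, the map task count and split-size hints, map-local (hash table) work, the vectorization mode, and index intermediate files. For classic map-reduce it is reached through MapredWork.getMapWork(); under Tez it appears among the BaseWork vertices returned by TezWork.getAllWork().

The helper below is a minimal sketch, not taken from the excerpts: a hypothetical MapWorkCollector that gathers every MapWork reachable from a single compiled task, mirroring the map-reduce and Tez traversals the examples perform inline. Import paths are assumed from the usual Hive source layout.

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hadoop.hive.ql.exec.Task;
    import org.apache.hadoop.hive.ql.exec.tez.TezTask;
    import org.apache.hadoop.hive.ql.plan.BaseWork;
    import org.apache.hadoop.hive.ql.plan.MapWork;
    import org.apache.hadoop.hive.ql.plan.MapredWork;
    import org.apache.hadoop.hive.ql.plan.TezWork;

    // Hypothetical helper, not part of Hive: collects the MapWork instances of one task.
    public final class MapWorkCollector {

      private MapWorkCollector() {
      }

      public static List<MapWork> mapWorksOf(Task<?> task) {
        List<MapWork> result = new ArrayList<MapWork>();
        if (task.getWork() instanceof MapredWork) {
          // Classic MR plan: a single MapWork (plus an optional ReduceWork).
          result.add(((MapredWork) task.getWork()).getMapWork());
        } else if (task instanceof TezTask) {
          // Tez plan: a DAG of BaseWork vertices, some of which are MapWork.
          TezWork tezWork = ((TezTask) task).getWork();
          for (BaseWork w : tezWork.getAllWork()) {
            if (w instanceof MapWork) {
              result.add((MapWork) w);
            }
          }
        }
        return result;
      }
    }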

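The first excerpt compiles a reentrant compact-index query and inspects its single root-level map-reduce task: the task's MapWork is checked for an input format derived from HiveInputFormat and, if the check passes, marked as input-format-sorted so the index can be read with a binary search; on any failure the plan falls back to the default, unsorted input format.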
    Driver driver = new Driver(queryConf);
    driver.compile(qlCommand.toString(), false);

    if (pctx.getConf().getBoolVar(ConfVars.HIVE_INDEX_COMPACT_BINARY_SEARCH) && useSorted) {
      // For now, only works if the predicate is a single condition
      MapWork work = null;
      String originalInputFormat = null;
      for (Task task : driver.getPlan().getRootTasks()) {
        // The index query should have one and only one map reduce task in the root tasks
        // Otherwise something is wrong, log the problem and continue using the default format
        if (task.getWork() instanceof MapredWork) {
          if (work != null) {
            LOG.error("Tried to use a binary search on a compact index but there were an " +
                      "unexpected number (>1) of root level map reduce tasks in the " +
                      "reentrant query plan.");
            work.setInputformat(null);
            work.setInputFormatSorted(false);
            break;
          }
          if (task.getWork() != null) {
            work = ((MapredWork)task.getWork()).getMapWork();
          }
          String inputFormat = work.getInputformat();
          originalInputFormat = inputFormat;
          if (inputFormat == null) {
            inputFormat = HiveConf.getVar(pctx.getConf(), HiveConf.ConfVars.HIVEINPUTFORMAT);
          }

          // We can only perform a binary search with HiveInputFormat and CombineHiveInputFormat
          // and BucketizedHiveInputFormat
          try {
            if (!HiveInputFormat.class.isAssignableFrom(Class.forName(inputFormat))) {
              work = null;
              break;
            }
          } catch (ClassNotFoundException e) {
            LOG.error("Map reduce work's input format class: " + inputFormat + " was not found. " +
                       "Cannot use the fact the compact index is sorted.");
            work = null;
            break;
          }

          work.setInputFormatSorted(true);
        }
      }

      if (work != null) {
        // Find the filter operator and expr node which act on the index column and mark them
        if (!findIndexColumnFilter(work.getAliasToWork().values())) {
          LOG.error("Could not locate the index column's filter operator and expr node. Cannot " +
                    "use the fact the compact index is sorted.");
          work.setInputformat(originalInputFormat);
          work.setInputFormatSorted(false);
        }
      }
    }



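Setting the input format on a classic map-reduce task: when the task is an ExecDriver, its MapredWork's MapWork is fetched and setInputFormat is applied to every operator tree in the alias-to-work map.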
  // loop over all the tasks recursively
  @Override
  protected void setInputFormat(Task<? extends Serializable> task) {
    if (task instanceof ExecDriver) {
      MapWork work = ((MapredWork) task.getWork()).getMapWork();
      HashMap<String, Operator<? extends OperatorDesc>> opMap = work.getAliasToWork();
      if (!opMap.isEmpty()) {
        for (Operator<? extends OperatorDesc> op : opMap.values()) {
          setInputFormat(work, op);
        }
      }
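Several excerpts start from MapWork.getAliasToWork() and then walk the operator trees rooted there (setInputFormat(work, op) above, findIndexColumnFilter(...) in the first excerpt). The sketch below is hypothetical, not part of Hive: it shows that traversal as a depth-first dump, assuming only getAliasToWork() and Operator.getChildOperators(), and prints each operator's class rather than mutating the tree.

    import java.util.List;

    import org.apache.hadoop.hive.ql.exec.Operator;
    import org.apache.hadoop.hive.ql.plan.MapWork;
    import org.apache.hadoop.hive.ql.plan.OperatorDesc;

    // Hypothetical helper, not part of Hive: depth-first dump of a MapWork's operator trees.
    public final class OperatorTreeDumper {

      private OperatorTreeDumper() {
      }

      public static void dump(MapWork work) {
        // One root operator (typically a table scan) per alias.
        for (Operator<? extends OperatorDesc> root : work.getAliasToWork().values()) {
          visit(root, 0);
        }
      }

      private static void visit(Operator<? extends OperatorDesc> op, int depth) {
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < depth; i++) {
          line.append("  ");
        }
        line.append(op.getClass().getSimpleName());
        System.out.println(line.toString());
        List<Operator<? extends OperatorDesc>> children = op.getChildOperators();
        if (children != null) {
          for (Operator<? extends OperatorDesc> child : children) {
            visit(child, depth + 1);
          }
        }
      }
    }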


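Map-side task initialization: the MapWork plan is looked up in the object cache (or deserialized from the JobConf via Utilities.getMapWork), a VectorMapOperator or a plain MapOperator is chosen according to the plan's vector mode, and the plan's map-local work is attached to the execution context.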
    try {
      jc = job;
      execContext.setJc(jc);
      // create map and fetch operators
      MapWork mrwork = (MapWork) cache.retrieve(PLAN_KEY);
      if (mrwork == null) {
        mrwork = Utilities.getMapWork(job);
        cache.cache(PLAN_KEY, mrwork);
      } else {
        Utilities.setMapWork(job, mrwork);
      }
      if (mrwork.getVectorMode()) {
        mo = new VectorMapOperator();
      } else {
        mo = new MapOperator();
      }
      mo.setConf(mrwork);
      // initialize map operator
      mo.setChildren(job);
      l4j.info(mo.dump(0));
      // initialize map local work
      localWork = mrwork.getMapLocalWork();
      execContext.setLocalWork(localWork);

      MapredContext.init(true, new JobConf(jc));

      mo.setExecContext(execContext);

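Checking the map joins of a MapRedTask: the MapWork is taken from the task's MapredWork and analyzed by a MapJoinCheck, and any resulting warnings are printed through the warn(...) helper whose closing lines open this excerpt.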
        String.format("Warning: %s", msg));
  }

  private void checkMapJoins(MapRedTask mrTsk) throws SemanticException {
    MapredWork mrWrk = mrTsk.getWork();
    MapWork mapWork = mrWrk.getMapWork();
    List<String> warnings = new MapJoinCheck(mrTsk.toString()).analyze(mapWork);
    if (!warnings.isEmpty()) {
      for (String w : warnings) {
        warn(w);
      }

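Rewriting the plan to use a chosen index: the current task's MapWork is switched to the index input format, the index intermediate file is registered on the work, and the query's inputs and task list are updated from the HiveIndexQueryContext.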
      // modify the parse context to use indexing
      // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
      HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);

      // prepare the map reduce job to use indexing
      MapWork work = currentTask.getWork().getMapWork();
      work.setInputformat(queryContext.getIndexInputFormat());
      work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
      // modify inputs based on index query
      Set<ReadEntity> inputs = pctx.getSemanticInputs();
      inputs.addAll(queryContext.getAdditionalSemanticInputs());
      List<Task<?>> chosenRewrite = queryContext.getQueryTasks();

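Metadata-only optimization: each table scan collected by the walker is flagged as metadata-only, and its alias in the enclosing MapWork is then processed.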
        = walkerCtx.getMetadataOnlyTableScans().iterator();

      while (iterator.hasNext()) {
        TableScanOperator tso = iterator.next();
        ((TableScanDesc)tso.getConf()).setIsMetadataOnly(true);
        MapWork work = ((MapredWork) task.getWork()).getMapWork();
        String alias = getAliasForTableScanOperator(work, tso);
        LOG.info("Metadata only table scan for " + alias);
        processAlias(work, alias);
      }

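Submitting a map-reduce job from a MapredWork: the JobConf is populated from the MapWork (map task count, split-size limits, input format, bucketized input, stats gathering) and the ReduceWork (reducer count), staged map-join hash tables are archived and shipped through the distributed cache, the job is submitted and monitored, and on completion every operator in the alias-to-work map receives a jobClose callback.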
    Context ctx = driverContext.getCtx();
    boolean ctxCreated = false;
    Path emptyScratchDir;

    MapWork mWork = work.getMapWork();
    ReduceWork rWork = work.getReduceWork();

    try {
      if (ctx == null) {
        ctx = new Context(job);
        ctxCreated = true;
      }

      emptyScratchDir = ctx.getMRTmpPath();
      FileSystem fs = emptyScratchDir.getFileSystem(job);
      fs.mkdirs(emptyScratchDir);
    } catch (IOException e) {
      e.printStackTrace();
      console.printError("Error launching map-reduce job", "\n"
          + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return 5;
    }

    ShimLoader.getHadoopShims().prepareJobOutput(job);
    //See the javadoc on HiveOutputFormatImpl and HadoopShims.prepareJobOutput()
    job.setOutputFormat(HiveOutputFormatImpl.class);

    job.setMapperClass(ExecMapper.class);

    job.setMapOutputKeyClass(HiveKey.class);
    job.setMapOutputValueClass(BytesWritable.class);

    try {
      job.setPartitionerClass((Class<? extends Partitioner>) (Class.forName(HiveConf.getVar(job,
          HiveConf.ConfVars.HIVEPARTITIONER))));
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e.getMessage());
    }

    if (mWork.getNumMapTasks() != null) {
      job.setNumMapTasks(mWork.getNumMapTasks().intValue());
    }

    if (mWork.getMaxSplitSize() != null) {
      HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mWork.getMaxSplitSize().longValue());
    }

    if (mWork.getMinSplitSize() != null) {
      HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mWork.getMinSplitSize().longValue());
    }

    if (mWork.getMinSplitSizePerNode() != null) {
      HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE, mWork.getMinSplitSizePerNode().longValue());
    }

    if (mWork.getMinSplitSizePerRack() != null) {
      HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK, mWork.getMinSplitSizePerRack().longValue());
    }

    job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
    job.setReducerClass(ExecReducer.class);

    // set input format information if necessary
    setInputAttributes(job);

    // Propagate Hive's reducer speculative-execution setting to the Hadoop job
    boolean useSpeculativeExecReducers = HiveConf.getBoolVar(job,
        HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HADOOPSPECULATIVEEXECREDUCERS,
        useSpeculativeExecReducers);

    String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);
    if ((inpFormat == null) || (!StringUtils.isNotBlank(inpFormat))) {
      inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName();
    }

    if (mWork.isUseBucketizedHiveInputFormat()) {
      inpFormat = BucketizedHiveInputFormat.class.getName();
    }

    LOG.info("Using " + inpFormat);

    try {
      job.setInputFormat((Class<? extends InputFormat>) (Class.forName(inpFormat)));
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e.getMessage());
    }


    // No-Op - we don't really write anything here ..
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Transfer HIVEAUXJARS and HIVEADDEDJARS to "tmpjars" so hadoop understands
    // it
    String auxJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEAUXJARS);
    String addedJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDJARS);
    if (StringUtils.isNotBlank(auxJars) || StringUtils.isNotBlank(addedJars)) {
      String allJars = StringUtils.isNotBlank(auxJars) ? (StringUtils.isNotBlank(addedJars) ? addedJars
          + "," + auxJars
          : auxJars)
          : addedJars;
      LOG.info("adding libjars: " + allJars);
      initializeFiles("tmpjars", allJars);
    }

    // Transfer HIVEADDEDFILES to "tmpfiles" so hadoop understands it
    String addedFiles = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDFILES);
    if (StringUtils.isNotBlank(addedFiles)) {
      initializeFiles("tmpfiles", addedFiles);
    }
    int returnVal = 0;
    boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJOBNAME));

    if (noName) {
      // This is for a special case to ensure unit tests pass
      HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, "JOB" + Utilities.randGen.nextInt());
    }
    String addedArchives = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDARCHIVES);
    // Transfer HIVEADDEDARCHIVES to "tmparchives" so hadoop understands it
    if (StringUtils.isNotBlank(addedArchives)) {
      initializeFiles("tmparchives", addedArchives);
    }

    try{
      MapredLocalWork localwork = mWork.getMapLocalWork();
      if (localwork != null && localwork.hasStagedAlias()) {
        if (!ShimLoader.getHadoopShims().isLocalMode(job)) {
          Path localPath = localwork.getTmpPath();
          Path hdfsPath = mWork.getTmpHDFSPath();

          FileSystem hdfs = hdfsPath.getFileSystem(job);
          FileSystem localFS = localPath.getFileSystem(job);
          FileStatus[] hashtableFiles = localFS.listStatus(localPath);
          int fileNumber = hashtableFiles.length;
          String[] fileNames = new String[fileNumber];

          for ( int i = 0; i < fileNumber; i++){
            fileNames[i] = hashtableFiles[i].getPath().getName();
          }

          //package and compress all the hashtable files to an archive file
          String stageId = this.getId();
          String archiveFileName = Utilities.generateTarFileName(stageId);
          localwork.setStageID(stageId);

          CompressionUtils.tar(localPath.toUri().getPath(), fileNames, archiveFileName);
          Path archivePath = Utilities.generateTarPath(localPath, stageId);
          LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archivePath);

          //upload archive file to hdfs
          Path hdfsFilePath = Utilities.generateTarPath(hdfsPath, stageId);
          short replication = (short) job.getInt("mapred.submit.replication", 10);
          hdfs.setReplication(hdfsFilePath, replication);
          hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
          LOG.info("Upload 1 archive file  from" + archivePath + " to: " + hdfsFilePath);

          //add the archive file to distributed cache
          DistributedCache.createSymlink(job);
          DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
          LOG.info("Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
        }
      }
      work.configureJobConf(job);
      List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx);
      Utilities.setInputPaths(job, inputPaths);

      Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());

      if (mWork.getSamplingType() > 0 && rWork != null && rWork.getNumReduceTasks() > 1) {
        try {
          handleSampling(driverContext, mWork, job, conf);
          job.setPartitionerClass(HiveTotalOrderPartitioner.class);
        } catch (IllegalStateException e) {
          console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
          rWork.setNumReduceTasks(1);
          job.setNumReduceTasks(1);
        } catch (Exception e) {
          LOG.error("Sampling error", e);
          console.printError(e.toString(),
              "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
          rWork.setNumReduceTasks(1);
          job.setNumReduceTasks(1);
        }
      }

      // remove the metastore password from the job conf so that the job tracker
      // doesn't show it in its logs
      String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD);
      if (pwd != null) {
        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE");
      }
      JobClient jc = new JobClient(job);
      // make this client wait if job tracker is not behaving well.
      Throttle.checkJobTracker(job, LOG);

      if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
        // initialize stats publishing table
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(job);
        if (factory != null) {
          statsPublisher = factory.getStatsPublisher();
          if (!statsPublisher.init(job)) { // creates the stats table if it does not exist
            if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
              throw
                new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
            }
          }
        }
      }

      Utilities.createTmpDirs(job, mWork);
      Utilities.createTmpDirs(job, rWork);

      // Finally SUBMIT the JOB!
      rj = jc.submitJob(job);
      // replace it back
      if (pwd != null) {
        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd);
      }

      returnVal = jobExecHelper.progress(rj, jc, ctx.getHiveTxnManager());
      success = (returnVal == 0);
    } catch (Exception e) {
      e.printStackTrace();
      String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
      if (rj != null) {
        mesg = "Ended Job = " + rj.getJobID() + mesg;
      } else {
        mesg = "Job Submission failed" + mesg;
      }

      // Has to use full name to make sure it does not conflict with
      // com.facebook.presto.hive.shaded.org.apache.commons.lang.StringUtils
      console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));

      success = false;
      returnVal = 1;
    } finally {
      Utilities.clearWork(job);
      try {
        if (ctxCreated) {
          ctx.clear();
        }

        if (rj != null) {
          if (returnVal != 0) {
            rj.killJob();
          }
          HadoopJobExecHelper.runningJobs.remove(rj);
          jobID = rj.getID().toString();
        }
      } catch (Exception e) {
      }
    }

    // get the list of Dynamic partition paths
    try {
      if (rj != null) {
        if (mWork.getAliasToWork() != null) {
          for (Operator<? extends OperatorDesc> op : mWork.getAliasToWork().values()) {
            op.jobClose(job, success);
          }
        }
        if (rWork != null) {
          rWork.getReducer().jobClose(job, success);

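Propagating MapWork attributes into the job configuration: a custom input format, the index intermediate file, and the input-format-sorted flag set by the index rewrite in the first excerpt are copied onto the Configuration before the job runs.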
  /**
   * Set hive input format, and input format file if necessary.
   */
  protected void setInputAttributes(Configuration conf) {
    MapWork mWork = work.getMapWork();
    if (mWork.getInputformat() != null) {
      HiveConf.setVar(conf, ConfVars.HIVEINPUTFORMAT, mWork.getInputformat());
    }
    if (mWork.getIndexIntermediateFile() != null) {
      conf.set("hive.index.compact.file", mWork.getIndexIntermediateFile());
      conf.set("hive.index.blockfilter.file", mWork.getIndexIntermediateFile());
    }

    // Intentionally overwrites anything the user may have put here
    conf.setBoolean("hive.input.format.sorted", mWork.isInputFormatSorted());

    if (HiveConf.getVar(conf, ConfVars.HIVE_CURRENT_DATABASE, null) == null) {
      HiveConf.setVar(conf, ConfVars.HIVE_CURRENT_DATABASE, getCurrentDB());
    }
  }

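The Tez counterpart of the input-format pass: every MapWork vertex in the TezWork DAG has its alias-to-operator map walked so setInputFormat can be applied to each operator tree.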
    if (task instanceof TezTask) {
      TezWork work = ((TezTask)task).getWork();
      List<BaseWork> all = work.getAllWork();
      for (BaseWork w: all) {
        if (w instanceof MapWork) {
          MapWork mapWork = (MapWork) w;
          HashMap<String, Operator<? extends OperatorDesc>> opMap = mapWork.getAliasToWork();
          if (!opMap.isEmpty()) {
            for (Operator<? extends OperatorDesc> op : opMap.values()) {
              setInputFormat(mapWork, op);
            }
          }
