Examples of org.apache.hadoop.hive.ql.exec.FileSinkOperator

org.apache.hadoop.hive.ql.exec.FileSinkOperator
File Sink operator implementation.

      this.pctx = pctx;
    }


    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      FileSinkOperator FS = (FileSinkOperator) nd;
      GroupByOperator cGBY = (GroupByOperator) stack.get(stack.size() - 3);
      ReduceSinkOperator RS = (ReduceSinkOperator) stack.get(stack.size() - 4);
      if (RS.getConf().getNumReducers() != 1 || !RS.getConf().getKeyCols().isEmpty()) {
        return null;
      }
      GroupByOperator pGBY = (GroupByOperator) stack.get(stack.size() - 5);


      Path fileName = FS.getConf().getFinalDirName();
      TableDesc tsDesc = createIntermediateFS(pGBY, fileName);


      for (AggregationDesc aggregation : cGBY.getConf().getAggregators()) {
        List<ExprNodeDesc> parameters = aggregation.getParameters();
        aggregation.setParameters(ExprNodeDescUtils.backtrack(parameters, cGBY, RS));

View Full Code Here

      TableDesc tsDesc = PlanUtils.getIntermediateFileTableDesc(PlanUtils
          .getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));


      // Create a file sink operator for this file name
      FileSinkDesc desc = new FileSinkDesc(fileName, tsDesc, false);
      FileSinkOperator newFS = (FileSinkOperator) OperatorFactory.get(desc, parent.getSchema());


      newFS.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>());
      newFS.getParentOperators().add(parent);


      parent.getChildOperators().clear();
      parent.getChildOperators().add(newFS);


      return tsDesc;

View Full Code Here

    while(!operators.isEmpty()) {
      Operator<?> current = operators.pop();
      seen.add(current);


      if (current instanceof FileSinkOperator) {
        FileSinkOperator fileSink = (FileSinkOperator)current;


        // remember it for additional processing later
        context.fileSinkSet.add(fileSink);


        FileSinkDesc desc = fileSink.getConf();
        Path path = desc.getDirName();
        List<FileSinkDesc> linked;


        if (!context.linkedFileSinks.containsKey(path)) {
          linked = new ArrayList<FileSinkDesc>();

View Full Code Here

    ParseContext parseCtx = ctx.getParseCtx();
    boolean chDir = false;
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    ctx.addRootIfPossible(currTask);


    FileSinkOperator fsOp = (FileSinkOperator) nd;
    boolean isInsertTable = // is INSERT OVERWRITE TABLE
        GenMapRedUtils.isInsertInto(parseCtx, fsOp);
    HiveConf hconf = parseCtx.getConf();


    // Mark this task as a final map reduce task (ignoring the optional merge task)
    ((MapredWork)currTask.getWork()).setFinalMapRed(true);


    // If this file sink desc has been processed due to a linked file sink desc,
    // use that task
    Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks();
    if (fileSinkDescs != null) {
      Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf());
      processLinkedFileDesc(ctx, childTask);
      return true;
    }


    // In case of unions or map-joins, it is possible that the file has
    // already been seen.
    // So, no need to attempt to merge the files again.
    if ((ctx.getSeenFileSinkOps() == null)
        || (!ctx.getSeenFileSinkOps().contains(nd))) {
      chDir = GenMapRedUtils.isMergeRequired(ctx.getMvTask(), hconf, fsOp, currTask, isInsertTable);
    }


    Path finalName = processFS(fsOp, stack, opProcCtx, chDir);


    if (chDir) {
      // Merge the files in the destination table/partitions by creating Map-only merge job
      // If underlying data is RCFile or OrcFile, RCFileBlockMerge task or
      // OrcFileStripeMerge task would be created.
      LOG.info("using CombineHiveInputformat for the merge job");
      GenMapRedUtils.createMRWorkForMergingFiles(fsOp, finalName,
          ctx.getDependencyTaskForMultiInsert(), ctx.getMvTask(),
          hconf, currTask);
    }


    FileSinkDesc fileSinkDesc = fsOp.getConf();
    if (fileSinkDesc.isLinkedFileSink()) {
      Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks =
        ctx.getLinkedFileDescTasks();
      if (linkedFileDescTasks == null) {
        linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>();

View Full Code Here

   */
  public static class ConstantPropagateFileSinkProc implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx, Object... nodeOutputs)
        throws SemanticException {
      FileSinkOperator op = (FileSinkOperator) nd;
      ConstantPropagateProcCtx cppCtx = (ConstantPropagateProcCtx) ctx;
      Map<ColumnInfo, ExprNodeDesc> constants = cppCtx.getPropagatedConstants(op);
      cppCtx.getOpToConstantExprs().put(op, constants);
      if (constants.isEmpty()) {
        return null;
      }
      FileSinkDesc fsdesc = op.getConf();
      DynamicPartitionCtx dpCtx = fsdesc.getDynPartCtx();
      if (dpCtx != null) {


        // If all dynamic partitions are propagated as constant, remove DP.
        Set<String> inputs = dpCtx.getInputToDPCols().keySet();


        // Assume only 1 parent for FS operator
        Operator<? extends Serializable> parent = op.getParentOperators().get(0);
        Map<ColumnInfo, ExprNodeDesc> parentConstants = cppCtx.getPropagatedConstants(parent);
        RowResolver rr = cppCtx.getOpToParseCtxMap().get(parent).getRowResolver();
        boolean allConstant = true;
        for (String input : inputs) {
          String tmp[] = rr.reverseLookup(input);

View Full Code Here

          if (!(desc instanceof ExprNodeColumnDesc)) {
            // Probably an expression, cant handle that
            return null;
          }
        }
        FileSinkOperator fsOp = (FileSinkOperator)(selOp.getChildren().get(0));
        if (fsOp.getChildOperators() != null && fsOp.getChildOperators().size() > 0) {
          // looks like a subq plan.
          return null;
        }


        Table tbl = pctx.getTopToTable().get(tsOp);

View Full Code Here

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {


      BucketingSortingCtx bctx = (BucketingSortingCtx)procCtx;
      FileSinkOperator fop = (FileSinkOperator)nd;


      Operator<? extends OperatorDesc> parent = getParent(stack);
      List<BucketCol> bucketCols = bctx.getBucketedCols(parent);
      List<ColumnInfo> colInfos = fop.getSchema().getSignature();


      // Set the inferred bucket columns for the file this FileSink produces
      if (bucketCols != null) {
        List<BucketCol> newBucketCols = getNewBucketCols(bucketCols, colInfos);
        bctx.getBucketedColsByDirectory().put(fop.getConf().getDirName().toString(), newBucketCols);
        bctx.setBucketedCols(fop, newBucketCols);
      }


      List<SortCol> sortCols = bctx.getSortedCols(parent);


      // Set the inferred sort columns for the file this FileSink produces
      if (sortCols != null) {
        List<SortCol> newSortCols = getNewSortCols(sortCols, colInfos);
        bctx.getSortedColsByDirectory().put(fop.getConf().getDirName().toString(), newSortCols);
        bctx.setSortedCols(fop, newSortCols);
      }


      return null;
    }

View Full Code Here


    Operator<? extends OperatorDesc> currOp = originalSMBJoinOp;
    while (true) {
      if ((currOp.getChildOperators() == null) || (currOp.getChildOperators().isEmpty())) {
        if (currOp instanceof FileSinkOperator) {
          FileSinkOperator fsOp = (FileSinkOperator)currOp;
          // The query has enforced that a sort-merge join should be performed.
          // For more details, look at 'removedReduceSinkBucketSort' in FileSinkDesc.java
          return !fsOp.getConf().isRemovedReduceSinkBucketSort();
        }


        // If it contains a reducer, the optimization is always on.
        // Since there exists a reducer, the sorting/bucketing properties due to the
        // sort-merge join operator are lost anyway. So, the plan cannot be wrong by

View Full Code Here

    if (mapJoinTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + mapJoinAlias +
          ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
    }
    FileSinkOperator mapJoinTaskFileSinkOperator =
        OperatorUtils.findSingleOperator(
            mapJoinTaskTableScanOperator, FileSinkOperator.class);
    if (mapJoinTaskFileSinkOperator == null) {
      throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() +
          " operator at the last operator of the MapJoin Task.");
    }


    // The mapJoinTaskFileSinkOperator writes to a different directory
    String childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName().toString();
    List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
    if (childMRAliases == null || childMRAliases.size() != 1) {
      return;
    }
    String childMRAlias = childMRAliases.get(0);


    // Sanity check to make sure there is no alias conflict after merge.
    for (Entry<String, ArrayList<String>> entry : childMapWork.getPathToAliases().entrySet()) {
      String path = entry.getKey();
      List<String> aliases = entry.getValue();


      if (path.equals(childMRPath)) {
        continue;
      }


      if (aliases.contains(mapJoinAlias)) {
        // alias confict should not happen here.
        return;
      }
    }


    MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapRedLocalWork();
    MapredLocalWork childLocalWork = childMapWork.getMapRedLocalWork();


    if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) ||
        (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
      // Right now, we do not handle the case that either of them is bucketed.
      // We should relax this constraint with a follow-up jira.
      return;
    }


    // We need to check if the total size of local tables is under the limit.
    // At here, we are using a strong condition, which is the total size of
    // local tables used by all input paths. Actually, we can relax this condition
    // to check the total size of local tables for every input path.
    // Example:
    //               UNION_ALL
    //              /         \
    //             /           \
    //            /             \
    //           /               \
    //       MapJoin1          MapJoin2
    //      /   |   \         /   |   \
    //     /    |    \       /    |    \
    //   Big1   S1   S2    Big2   S3   S4
    // In this case, we have two MapJoins, MapJoin1 and MapJoin2. Big1 and Big2 are two
    // big tables, and S1, S2, S3, and S4 are four small tables. Hash tables of S1 and S2
    // will only be used by Map tasks processing Big1. Hash tables of S3 and S4 will only
    // be used by Map tasks processing Big2. If Big1!=Big2, we should only check if the size
    // of S1 + S2 is under the limit, and if the size of S3 + S4 is under the limit.
    // But, right now, we are checking the size of S1 + S2 + S3 + S4 is under the limit.
    // If Big1=Big2, we will only scan a path once. So, MapJoin1 and MapJoin2 will be executed
    // in the same Map task. In this case, we need to make sure the size of S1 + S2 + S3 + S4
    // is under the limit.
    if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)){
      // The total size of local tables may not be under
      // the limit after we merge mapJoinLocalWork and childLocalWork.
      // Do not merge.
      return;
    }


    TableScanOperator childMRTaskTableScanOperator =
        OperatorUtils.findSingleOperator(
            childMapWork.getAliasToWork().get(childMRAlias), TableScanOperator.class);
    if (childMRTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + childMRAlias +
          ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
    }


    List<Operator<? extends OperatorDesc>> parentsInMapJoinTask =
        mapJoinTaskFileSinkOperator.getParentOperators();
    List<Operator<? extends OperatorDesc>> childrenInChildMRTask =
        childMRTaskTableScanOperator.getChildOperators();
    if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
      // Do not merge if we do not know how to connect two operator trees.
      return;

View Full Code Here

  public Object process(Node nd, Stack<Node> stack,
      NodeProcessorCtx procCtx, Object... nodeOutputs)
      throws SemanticException {


    GenTezProcContext context = (GenTezProcContext) procCtx;
    FileSinkOperator fileSink = (FileSinkOperator) nd;


    // just remember it for later processing
    context.fileSinkSet.add(fileSink);
    return true;
  }

View Full Code Here

0 1 2 3 4 5 6 7 8

TOP

Related Classes of org.apache.hadoop.hive.ql.exec.FileSinkOperator

org.apache.hadoop.fs.FileStatus

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.fs.Path

org.apache.hadoop.hive.ql.io.FSRecordWriter

org.apache.hadoop.hive.ql.optimizer.BucketingSortingReduceSinkOptimizer$BucketSortReduceSinkProcessor

org.apache.hadoop.hive.ql.optimizer.ConstantPropagateProcFactory$ConstantPropagateFileSinkProc

org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils

org.apache.hadoop.hive.ql.optimizer.GenMRFileSink1

org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingOpProcFactory$FileSinkInferrer

org.apache.hadoop.hive.ql.optimizer.physical.CommonJoinResolver$CommonJoinTaskDispatcher

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.