Examples of org.apache.hadoop.hive.ql.exec.FileSinkOperator

org.apache.hadoop.hive.ql.exec.FileSinkOperator
File Sink operator implementation.


    // Create a FileSink operator
    TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
    FileSinkDesc fsOutputDesc = new FileSinkDesc(finalName, ts,
      conf.getBoolVar(ConfVars.COMPRESSRESULT));
    FileSinkOperator fsOutput = (FileSinkOperator) OperatorFactory.getAndMakeChild(
      fsOutputDesc, inputRS, tsMerge);


    // If the input FileSinkOperator is a dynamic partition enabled, the tsMerge input schema
    // needs to include the partition column, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that it needs to dynamically partitioned.

View Full Code Here

    while(!operators.isEmpty()) {
      Operator<?> current = operators.pop();
      seen.add(current);


      if (current instanceof FileSinkOperator) {
        FileSinkOperator fileSink = (FileSinkOperator)current;


        // remember it for additional processing later
        context.fileSinkSet.add(fileSink);


        FileSinkDesc desc = fileSink.getConf();
        Path path = desc.getDirName();
        List<FileSinkDesc> linked;


        if (!context.linkedFileSinks.containsKey(path)) {
          linked = new ArrayList<FileSinkDesc>();

View Full Code Here

    ParseContext parseCtx = ctx.getParseCtx();
    boolean chDir = false;
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    ctx.addRootIfPossible(currTask);


    FileSinkOperator fsOp = (FileSinkOperator) nd;
    boolean isInsertTable = // is INSERT OVERWRITE TABLE
        GenMapRedUtils.isInsertInto(parseCtx, fsOp);
    HiveConf hconf = parseCtx.getConf();


    // Mark this task as a final map reduce task (ignoring the optional merge task)
    ((MapredWork)currTask.getWork()).setFinalMapRed(true);


    // If this file sink desc has been processed due to a linked file sink desc,
    // use that task
    Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks();
    if (fileSinkDescs != null) {
      Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf());
      processLinkedFileDesc(ctx, childTask);
      return true;
    }


    // In case of unions or map-joins, it is possible that the file has
    // already been seen.
    // So, no need to attempt to merge the files again.
    if ((ctx.getSeenFileSinkOps() == null)
        || (!ctx.getSeenFileSinkOps().contains(nd))) {
      chDir = GenMapRedUtils.isMergeRequired(ctx.getMvTask(), hconf, fsOp, currTask, isInsertTable);
    }


    Path finalName = processFS(fsOp, stack, opProcCtx, chDir);


    if (chDir) {
      // Merge the files in the destination table/partitions by creating Map-only merge job
      // If underlying data is RCFile a RCFileBlockMerge task would be created.
      LOG.info("using CombineHiveInputformat for the merge job");
      GenMapRedUtils.createMRWorkForMergingFiles(fsOp, finalName,
          ctx.getDependencyTaskForMultiInsert(), ctx.getMvTask(),
          hconf, currTask);
    }


    FileSinkDesc fileSinkDesc = fsOp.getConf();
    if (fileSinkDesc.isLinkedFileSink()) {
      Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks =
        ctx.getLinkedFileDescTasks();
      if (linkedFileDescTasks == null) {
        linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>();

View Full Code Here


    // Create a FileSink operator
    TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
    FileSinkDesc fsOutputDesc = new FileSinkDesc(finalName, ts,
      conf.getBoolVar(ConfVars.COMPRESSRESULT));
    FileSinkOperator fsOutput = (FileSinkOperator) OperatorFactory.getAndMakeChild(
      fsOutputDesc, inputRS, tsMerge);


    // If the input FileSinkOperator is a dynamic partition enabled, the tsMerge input schema
    // needs to include the partition column, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that it needs to dynamically partitioned.

View Full Code Here

        if (!(selOp.getConf().getColList().size() == aggrs.size())) {
          // all select columns must be aggregations
          return null;


        }
        FileSinkOperator fsOp = (FileSinkOperator)(selOp.getChildren().get(0));
        if (fsOp.getChildOperators() != null && fsOp.getChildOperators().size() > 0) {
          // looks like a subq plan.
          return null;
        }


        Table tbl = pctx.getTopToTable().get(tsOp);

View Full Code Here

    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {


      // introduce RS and EX before FS. If the operator tree already contains
      // RS then ReduceSinkDeDuplication optimization should merge them
      FileSinkOperator fsOp = (FileSinkOperator) nd;


      LOG.info("Sorted dynamic partitioning optimization kicked in..");


      // if not dynamic partitioning then bail out
      if (fsOp.getConf().getDynPartCtx() == null) {
        LOG.debug("Bailing out of sort dynamic partition optimization as dynamic partitioning context is null");
        return null;
      }


      // if list bucketing then bail out
      ListBucketingCtx lbCtx = fsOp.getConf().getLbCtx();
      if (lbCtx != null && !lbCtx.getSkewedColNames().isEmpty()
          && !lbCtx.getSkewedColValues().isEmpty()) {
        LOG.debug("Bailing out of sort dynamic partition optimization as list bucketing is enabled");
        return null;
      }


      Table destTable = parseCtx.getFsopToTable().get(fsOp);
      if (destTable == null) {
        LOG.debug("Bailing out of sort dynamic partition optimization as destination table is null");
        return null;
      }


      // if RS is inserted by enforce bucketing or sorting, we need to remove it
      // since ReduceSinkDeDuplication will not merge them to single RS.
      // RS inserted by enforce bucketing/sorting will have bucketing column in
      // reduce sink key whereas RS inserted by this optimization will have
      // partition columns followed by bucket number followed by sort columns in
      // the reduce sink key. Since both key columns are not prefix subset
      // ReduceSinkDeDuplication will not merge them together resulting in 2 MR jobs.
      // To avoid that we will remove the RS (and EX) inserted by enforce bucketing/sorting.
      removeRSInsertedByEnforceBucketing(fsOp);


      // unlink connection between FS and its parent
      Operator<? extends OperatorDesc> fsParent = fsOp.getParentOperators().get(0);
      fsParent.getChildOperators().clear();


      DynamicPartitionCtx dpCtx = fsOp.getConf().getDynPartCtx();
      int numBuckets = destTable.getNumBuckets();


      // if enforce bucketing/sorting is disabled numBuckets will not be set.
      // set the number of buckets here to ensure creation of empty buckets
      dpCtx.setNumBuckets(numBuckets);


      // Get the positions for partition, bucket and sort columns
      List<Integer> bucketPositions = getBucketPositions(destTable.getBucketCols(),
          destTable.getCols());
      ObjectPair<List<Integer>, List<Integer>> sortOrderPositions = getSortPositionsOrder(
          destTable.getSortCols(), destTable.getCols());
      List<Integer> sortPositions = sortOrderPositions.getFirst();
      List<Integer> sortOrder = sortOrderPositions.getSecond();
      List<Integer> partitionPositions = getPartitionPositions(dpCtx, fsParent.getSchema());
      List<ColumnInfo> colInfos = parseCtx.getOpParseCtx().get(fsParent).getRowResolver()
          .getColumnInfos();
      ArrayList<ExprNodeDesc> bucketColumns = getPositionsToExprNodes(bucketPositions, colInfos);


      // update file sink descriptor
      fsOp.getConf().setMultiFileSpray(false);
      fsOp.getConf().setNumFiles(1);
      fsOp.getConf().setTotalFiles(1);


      // Create ReduceSinkDesc
      RowResolver inputRR = parseCtx.getOpParseCtx().get(fsParent).getRowResolver();
      ObjectPair<String, RowResolver> pair = copyRowResolver(inputRR);
      RowResolver outRR = pair.getSecond();
      ArrayList<ColumnInfo> valColInfo = Lists.newArrayList(fsParent.getSchema().getSignature());
      ArrayList<ExprNodeDesc> newValueCols = Lists.newArrayList();
      Map<String, ExprNodeDesc> colExprMap = Maps.newHashMap();
      for (ColumnInfo ci : valColInfo) {
        newValueCols.add(new ExprNodeColumnDesc(ci.getType(), ci.getInternalName(), ci
            .getTabAlias(), ci.isHiddenVirtualCol()));
        colExprMap.put(ci.getInternalName(), newValueCols.get(newValueCols.size() - 1));
      }
      ReduceSinkDesc rsConf = getReduceSinkDesc(partitionPositions, sortPositions, sortOrder,
          newValueCols, bucketColumns, numBuckets, fsParent);


      // Create ReduceSink operator
      ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(
          OperatorFactory.getAndMakeChild(rsConf, new RowSchema(outRR.getColumnInfos()), fsParent),
          outRR, parseCtx);
      rsOp.setColumnExprMap(colExprMap);


      // Create ExtractDesc
      ObjectPair<String, RowResolver> exPair = copyRowResolver(outRR);
      RowResolver exRR = exPair.getSecond();
      ExtractDesc exConf = new ExtractDesc(new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
          Utilities.ReduceField.VALUE.toString(), "", false));


      // Create Extract Operator
      ExtractOperator exOp = (ExtractOperator) putOpInsertMap(
          OperatorFactory.getAndMakeChild(exConf, new RowSchema(exRR.getColumnInfos()), rsOp),
          exRR, parseCtx);


      // link EX to FS
      fsOp.getParentOperators().clear();
      fsOp.getParentOperators().add(exOp);
      exOp.getChildOperators().add(fsOp);


      // Set if partition sorted or partition bucket sorted
      fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_SORTED);
      if (bucketColumns.size() > 0) {
        fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_BUCKET_SORTED);
      }


      // update partition column info in FS descriptor
      ArrayList<ExprNodeDesc> partitionColumns = getPositionsToExprNodes(partitionPositions, rsOp
          .getSchema().getSignature());
      fsOp.getConf().setPartitionCols(partitionColumns);


      LOG.info("Inserted " + rsOp.getOperatorId() + " and " + exOp.getOperatorId()
          + " as parent of " + fsOp.getOperatorId() + " and child of " + fsParent.getOperatorId());
      return null;
    }

View Full Code Here

    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {


      // introduce RS and EX before FS. If the operator tree already contains
      // RS then ReduceSinkDeDuplication optimization should merge them
      FileSinkOperator fsOp = (FileSinkOperator) nd;


      LOG.info("Sorted dynamic partitioning optimization kicked in..");


      // if not dynamic partitioning then bail out
      if (fsOp.getConf().getDynPartCtx() == null) {
        LOG.debug("Bailing out of sort dynamic partition optimization as dynamic partitioning context is null");
        return null;
      }


      // if list bucketing then bail out
      ListBucketingCtx lbCtx = fsOp.getConf().getLbCtx();
      if (lbCtx != null && !lbCtx.getSkewedColNames().isEmpty()
          && !lbCtx.getSkewedColValues().isEmpty()) {
        LOG.debug("Bailing out of sort dynamic partition optimization as list bucketing is enabled");
        return null;
      }


      Table destTable = parseCtx.getFsopToTable().get(fsOp);
      if (destTable == null) {
        LOG.debug("Bailing out of sort dynamic partition optimization as destination table is null");
        return null;
      }


      // if RS is inserted by enforce bucketing or sorting, we need to remove it
      // since ReduceSinkDeDuplication will not merge them to single RS.
      // RS inserted by enforce bucketing/sorting will have bucketing column in
      // reduce sink key whereas RS inserted by this optimization will have
      // partition columns followed by bucket number followed by sort columns in
      // the reduce sink key. Since both key columns are not prefix subset
      // ReduceSinkDeDuplication will not merge them together resulting in 2 MR jobs.
      // To avoid that we will remove the RS (and EX) inserted by enforce bucketing/sorting.
      removeRSInsertedByEnforceBucketing(fsOp);


      // unlink connection between FS and its parent
      Operator<? extends OperatorDesc> fsParent = fsOp.getParentOperators().get(0);
      fsParent.getChildOperators().clear();


      DynamicPartitionCtx dpCtx = fsOp.getConf().getDynPartCtx();
      int numBuckets = destTable.getNumBuckets();


      // if enforce bucketing/sorting is disabled numBuckets will not be set.
      // set the number of buckets here to ensure creation of empty buckets
      dpCtx.setNumBuckets(numBuckets);


      // Get the positions for partition, bucket and sort columns
      List<Integer> bucketPositions = getBucketPositions(destTable.getBucketCols(),
          destTable.getCols());
      ObjectPair<List<Integer>, List<Integer>> sortOrderPositions = getSortPositionsOrder(
          destTable.getSortCols(), destTable.getCols());
      List<Integer> sortPositions = sortOrderPositions.getFirst();
      List<Integer> sortOrder = sortOrderPositions.getSecond();
      List<Integer> partitionPositions = getPartitionPositions(dpCtx, fsParent.getSchema());
      List<ColumnInfo> colInfos = parseCtx.getOpParseCtx().get(fsParent).getRowResolver()
          .getColumnInfos();
      ArrayList<ExprNodeDesc> bucketColumns = getPositionsToExprNodes(bucketPositions, colInfos);


      // update file sink descriptor
      fsOp.getConf().setMultiFileSpray(false);
      fsOp.getConf().setNumFiles(1);
      fsOp.getConf().setTotalFiles(1);


      // Create ReduceSinkDesc
      RowResolver inputRR = parseCtx.getOpParseCtx().get(fsParent).getRowResolver();
      ObjectPair<String, RowResolver> pair = copyRowResolver(inputRR);
      RowResolver outRR = pair.getSecond();
      ArrayList<ColumnInfo> valColInfo = Lists.newArrayList(fsParent.getSchema().getSignature());
      ArrayList<ExprNodeDesc> newValueCols = Lists.newArrayList();
      Map<String, ExprNodeDesc> colExprMap = Maps.newHashMap();
      for (ColumnInfo ci : valColInfo) {
        newValueCols.add(new ExprNodeColumnDesc(ci.getType(), ci.getInternalName(), ci
            .getTabAlias(), ci.isHiddenVirtualCol()));
        colExprMap.put(ci.getInternalName(), newValueCols.get(newValueCols.size() - 1));
      }
      ReduceSinkDesc rsConf = getReduceSinkDesc(partitionPositions, sortPositions, sortOrder,
          newValueCols, bucketColumns, numBuckets, fsParent);


      // Create ReduceSink operator
      ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(
          OperatorFactory.getAndMakeChild(rsConf, new RowSchema(outRR.getColumnInfos()), fsParent),
          outRR, parseCtx);
      rsOp.setColumnExprMap(colExprMap);


      // Create ExtractDesc
      ObjectPair<String, RowResolver> exPair = copyRowResolver(outRR);
      RowResolver exRR = exPair.getSecond();
      ExtractDesc exConf = new ExtractDesc(new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
          Utilities.ReduceField.VALUE.toString(), "", false));


      // Create Extract Operator
      ExtractOperator exOp = (ExtractOperator) putOpInsertMap(
          OperatorFactory.getAndMakeChild(exConf, new RowSchema(exRR.getColumnInfos()), rsOp),
          exRR, parseCtx);


      // link EX to FS
      fsOp.getParentOperators().clear();
      fsOp.getParentOperators().add(exOp);
      exOp.getChildOperators().add(fsOp);


      // Set if partition sorted or partition bucket sorted
      fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_SORTED);
      if (bucketColumns.size() > 0) {
        fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_BUCKET_SORTED);
      }


      // update partition column info in FS descriptor
      ArrayList<ExprNodeDesc> partitionColumns = getPositionsToExprNodes(partitionPositions, rsOp
          .getSchema().getSignature());
      fsOp.getConf().setPartitionCols(partitionColumns);


      LOG.info("Inserted " + rsOp.getOperatorId() + " and " + exOp.getOperatorId()
          + " as parent of " + fsOp.getOperatorId() + " and child of " + fsParent.getOperatorId());
      return null;
    }

View Full Code Here

  public Object process(Node nd, Stack<Node> stack,
      NodeProcessorCtx procCtx, Object... nodeOutputs)
      throws SemanticException {


    GenTezProcContext context = (GenTezProcContext) procCtx;
    FileSinkOperator fileSink = (FileSinkOperator) nd;


    // just remember it for later processing
    context.fileSinkSet.add(fileSink);
    return true;
  }

View Full Code Here

    if (mapJoinTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + mapJoinAlias +
          ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
    }
    FileSinkOperator mapJoinTaskFileSinkOperator =
        OperatorUtils.findSingleOperator(
            mapJoinTaskTableScanOperator, FileSinkOperator.class);
    if (mapJoinTaskFileSinkOperator == null) {
      throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() +
          " operator at the last operator of the MapJoin Task.");
    }


    // The mapJoinTaskFileSinkOperator writes to a different directory
    String childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName();
    List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
    if (childMRAliases == null || childMRAliases.size() != 1) {
      return;
    }
    String childMRAlias = childMRAliases.get(0);


    MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapLocalWork();
    MapredLocalWork childLocalWork = childMapWork.getMapLocalWork();


    if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) ||
        (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
      // Right now, we do not handle the case that either of them is bucketed.
      // We should relax this constraint with a follow-up jira.
      return;
    }


    // We need to check if the total size of local tables is under the limit.
    // At here, we are using a strong condition, which is the total size of
    // local tables used by all input paths. Actually, we can relax this condition
    // to check the total size of local tables for every input path.
    // Example:
    //               UNION_ALL
    //              /         \
    //             /           \
    //            /             \
    //           /               \
    //       MapJoin1          MapJoin2
    //      /   |   \         /   |   \
    //     /    |    \       /    |    \
    //   Big1   S1   S2    Big2   S3   S4
    // In this case, we have two MapJoins, MapJoin1 and MapJoin2. Big1 and Big2 are two
    // big tables, and S1, S2, S3, and S4 are four small tables. Hash tables of S1 and S2
    // will only be used by Map tasks processing Big1. Hash tables of S3 and S4 will only
    // be used by Map tasks processing Big2. If Big1!=Big2, we should only check if the size
    // of S1 + S2 is under the limit, and if the size of S3 + S4 is under the limit.
    // But, right now, we are checking the size of S1 + S2 + S3 + S4 is under the limit.
    // If Big1=Big2, we will only scan a path once. So, MapJoin1 and MapJoin2 will be executed
    // in the same Map task. In this case, we need to make sure the size of S1 + S2 + S3 + S4
    // is under the limit.
    if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)){
      // The total size of local tables may not be under
      // the limit after we merge mapJoinLocalWork and childLocalWork.
      // Do not merge.
      return;
    }


    TableScanOperator childMRTaskTableScanOperator =
        OperatorUtils.findSingleOperator(
            childMapWork.getAliasToWork().get(childMRAlias), TableScanOperator.class);
    if (childMRTaskTableScanOperator == null) {
      throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() +
          " operator as the work associated with alias " + childMRAlias +
          ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
    }


    List<Operator<? extends OperatorDesc>> parentsInMapJoinTask =
        mapJoinTaskFileSinkOperator.getParentOperators();
    List<Operator<? extends OperatorDesc>> childrenInChildMRTask =
        childMRTaskTableScanOperator.getChildOperators();
    if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
      // Do not merge if we do not know how to connect two operator trees.
      return;

View Full Code Here


    Operator<? extends OperatorDesc> currOp = originalSMBJoinOp;
    while (true) {
      if (currOp.getChildOperators() == null) {
        if (currOp instanceof FileSinkOperator) {
          FileSinkOperator fsOp = (FileSinkOperator)currOp;
          // The query has enforced that a sort-merge join should be performed.
          // For more details, look at 'removedReduceSinkBucketSort' in FileSinkDesc.java
          return !fsOp.getConf().isRemovedReduceSinkBucketSort();
        }


        // If it contains a reducer, the optimization is always on.
        // Since there exists a reducer, the sorting/bucketing properties due to the
        // sort-merge join operator are lost anyway. So, the plan cannot be wrong by

View Full Code Here

0 1 2 3 4 5 6 7 8

TOP

Related Classes of org.apache.hadoop.hive.ql.exec.FileSinkOperator

org.apache.hadoop.fs.FileStatus

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.fs.Path

org.apache.hadoop.hive.ql.io.FSRecordWriter

org.apache.hadoop.hive.ql.optimizer.BucketingSortingReduceSinkOptimizer$BucketSortReduceSinkProcessor

org.apache.hadoop.hive.ql.optimizer.ConstantPropagateProcFactory$ConstantPropagateFileSinkProc

org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils

org.apache.hadoop.hive.ql.optimizer.GenMRFileSink1

org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingOpProcFactory$FileSinkInferrer

org.apache.hadoop.hive.ql.optimizer.physical.CommonJoinResolver$CommonJoinTaskDispatcher

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.