Examples of org.apache.hadoop.hive.ql.exec.FileSinkOperator

org.apache.hadoop.hive.ql.exec.FileSinkOperator
File Sink operator implementation.

    ParseContext parseCtx = ctx.getParseCtx();
    boolean chDir = false;
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    ctx.addRootIfPossible(currTask);


    FileSinkOperator fsOp = (FileSinkOperator) nd;
    boolean isInsertTable = // is INSERT OVERWRITE TABLE
    fsOp.getConf().getTableInfo().getTableName() != null &&
        parseCtx.getQB().getParseInfo().isInsertToTable();
    HiveConf hconf = parseCtx.getConf();


    // Mark this task as a final map reduce task (ignoring the optional merge task)
    ((MapredWork)currTask.getWork()).setFinalMapRed(true);


    // If this file sink desc has been processed due to a linked file sink desc,
    // use that task
    Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks();
    if (fileSinkDescs != null) {
      Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf());
      processLinkedFileDesc(ctx, childTask);
      return true;
    }


    // Has the user enabled merging of files for map-only jobs or for all jobs
    if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty())) {
      List<Task<MoveWork>> mvTasks = ctx.getMvTask();


      // In case of unions or map-joins, it is possible that the file has
      // already been seen.
      // So, no need to attempt to merge the files again.
      if ((ctx.getSeenFileSinkOps() == null)
          || (!ctx.getSeenFileSinkOps().contains(nd))) {


        // no need of merging if the move is to a local file system
        MoveTask mvTask = (MoveTask) findMoveTask(mvTasks, fsOp);


        if (mvTask != null && isInsertTable && hconf.getBoolVar(ConfVars.HIVESTATSAUTOGATHER)) {
          addStatsTask(fsOp, mvTask, currTask, parseCtx.getConf());
        }


        if ((mvTask != null) && !mvTask.isLocal() && fsOp.getConf().canBeMerged()) {
          if (fsOp.getConf().isLinkedFileSink()) {
            // If the user has HIVEMERGEMAPREDFILES set to false, the idea was the
            // number of reducers are few, so the number of files anyway are small.
            // However, with this optimization, we are increasing the number of files
            // possibly by a big margin. So, merge aggresively.
            if (hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) ||
                hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES)) {
              chDir = true;
            }
          } else {
              // There are separate configuration parameters to control whether to
              // merge for a map-only job
              // or for a map-reduce job
              MapredWork currWork = (MapredWork) currTask.getWork();
              boolean mergeMapOnly =
                  hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) && currWork.getReduceWork() == null;
              boolean mergeMapRed =
                  hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES) &&
                      currWork.getReduceWork() != null;
              if (mergeMapOnly || mergeMapRed) {
                chDir = true;
              }
          }
        }
      }
    }


    String finalName = processFS(fsOp, stack, opProcCtx, chDir);


    if (chDir) {
      // Merge the files in the destination table/partitions by creating Map-only merge job
      // If underlying data is RCFile a RCFileBlockMerge task would be created.
      LOG.info("using CombineHiveInputformat for the merge job");
      createMRWorkForMergingFiles(fsOp, ctx, finalName);
    }


    FileSinkDesc fileSinkDesc = fsOp.getConf();
    if (fileSinkDesc.isLinkedFileSink()) {
      Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks =
        ctx.getLinkedFileDescTasks();
      if (linkedFileDescTasks == null) {
        linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>();

View Full Code Here


    // Create a FileSink operator
    TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
    FileSinkDesc fsOutputDesc = new FileSinkDesc(finalName, ts,
      conf.getBoolVar(ConfVars.COMPRESSRESULT));
    FileSinkOperator fsOutput = (FileSinkOperator) OperatorFactory.getAndMakeChild(
      fsOutputDesc, inputRS, tsMerge);


    // If the input FileSinkOperator is a dynamic partition enabled, the tsMerge input schema
    // needs to include the partition column, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that it needs to dynamically partitioned.

View Full Code Here

      this.pctx = pctx;
    }


    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      FileSinkOperator FS = (FileSinkOperator) nd;
      GroupByOperator cGBY = (GroupByOperator) stack.get(stack.size() - 3);
      ReduceSinkOperator RS = (ReduceSinkOperator) stack.get(stack.size() - 4);
      if (RS.getConf().getNumReducers() != 1 || !RS.getConf().getKeyCols().isEmpty()) {
        return null;
      }
      GroupByOperator pGBY = (GroupByOperator) stack.get(stack.size() - 5);


      String fileName = FS.getConf().getFinalDirName();
      TableDesc tsDesc = createIntermediateFS(pGBY, fileName);


      for (AggregationDesc aggregation : cGBY.getConf().getAggregators()) {
        List<ExprNodeDesc> parameters = aggregation.getParameters();
        aggregation.setParameters(ExprNodeDescUtils.backtrack(parameters, cGBY, pGBY));

View Full Code Here

      TableDesc tsDesc = PlanUtils.getIntermediateFileTableDesc(PlanUtils
          .getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));


      // Create a file sink operator for this file name
      FileSinkDesc desc = new FileSinkDesc(fileName, tsDesc, false);
      FileSinkOperator newFS = (FileSinkOperator) OperatorFactory.get(desc, parent.getSchema());


      newFS.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>());
      newFS.getParentOperators().add(parent);


      parent.getChildOperators().clear();
      parent.getChildOperators().add(newFS);


      return tsDesc;

View Full Code Here


      if (!(op instanceof FileSinkOperator)) {
        return;
      }


      FileSinkOperator fop = (FileSinkOperator)op;
      String workDir = fop.getConf().getDirName();


      Map<String, ArrayList<String>> childPathToAliases = childWork.getPathToAliases();
      if (childPathToAliases.size() > 1) {
        return;
      }


      // The filesink writes to a different directory
      if (!childPathToAliases.keySet().iterator().next().equals(workDir)) {
        return;
      }


      // Either of them should not be bucketed
      if ((localWork.getBucketMapjoinContext() != null) ||
          (childLocalWork.getBucketMapjoinContext() != null)) {
        return;
      }


      // Merge the trees
      if (childWork.getAliasToWork().size() > 1) {
        return;
      }
      long mapJoinSize = HiveConf.getLongVar(conf,
          HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
      long localTableTotalSize = 0;
      for (String alias : localWork.getAliasToWork().keySet()) {
        Long tabSize = aliasToSize.get(alias);
        if (tabSize == null) {
          /* if the size is unavailable, we need to assume a size 1 greater than mapJoinSize
           * this implies that merge cannot happen so we can return.
           */
          return;
        }
        localTableTotalSize += tabSize;
      }


      for (String alias : childLocalWork.getAliasToWork().keySet()) {
        Long tabSize = aliasToSize.get(alias);
        if (tabSize == null) {
          /* if the size is unavailable, we need to assume a size 1 greater than mapJoinSize
           * this implies that merge cannot happen so we can return.
           */
          return;
        }
        localTableTotalSize += tabSize;
        if (localTableTotalSize > mapJoinSize) {
          return;
        }
      }


      Operator<? extends Serializable> childAliasOp =
          childWork.getAliasToWork().values().iterator().next();
      if (fop.getParentOperators().size() > 1) {
        return;
      }


      // Merge the 2 trees - remove the FileSinkOperator from the first tree pass it to the
      // top of the second
      Operator<? extends Serializable> parentFOp = fop.getParentOperators().get(0);
      parentFOp.getChildOperators().remove(fop);
      parentFOp.getChildOperators().add(childAliasOp);
      List<Operator<? extends OperatorDesc>> parentOps =
          new ArrayList<Operator<? extends OperatorDesc>>();
      parentOps.add(parentFOp);

View Full Code Here

      if (!(mapJoinLeafOperator instanceof FileSinkOperator)) {
        // Sanity check, shouldn't happen.
        return;
      }


      FileSinkOperator mapJoinTaskFileSinkOperator = (FileSinkOperator) mapJoinLeafOperator;


      // The filesink writes to a different directory
      String workDir = mapJoinTaskFileSinkOperator.getConf().getDirName();
      if (!childPathToAliases.keySet().iterator().next().equals(workDir)) {
        return;
      }


      MapredLocalWork mapJoinLocalWork = mapJoinWork.getMapLocalWork();
      MapredLocalWork childLocalWork = childWork.getMapLocalWork();


      // Either of them should not be bucketed
      if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) ||
          (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
        return;
      }


      if (childWork.getAliasToWork().size() > 1) {
        return;
      }


      Operator<? extends Serializable> childAliasOp =
          childWork.getAliasToWork().values().iterator().next();
      if (mapJoinTaskFileSinkOperator.getParentOperators().size() > 1) {
        return;
      }


      // Merge the 2 trees - remove the FileSinkOperator from the first tree pass it to the
      // top of the second
      Operator<? extends Serializable> parentFOp = mapJoinTaskFileSinkOperator
          .getParentOperators().get(0);
      parentFOp.getChildOperators().remove(mapJoinTaskFileSinkOperator);
      parentFOp.getChildOperators().add(childAliasOp);
      List<Operator<? extends OperatorDesc>> parentOps =
          new ArrayList<Operator<? extends OperatorDesc>>();

View Full Code Here

        // FileSink cannot be simply cloned - it requires some special processing.
        // Sub-queries for the union will be processed as independent map-reduce jobs
        // possibly running in parallel. Those sub-queries cannot write to the same
        // directory. Clone the filesink, but create a sub-directory in the final path
        // for each sub-query. Also, these different filesinks need to be linked to each other
        FileSinkOperator fileSinkOp = (FileSinkOperator)stack.get(pos);
        // For file sink operator, change the directory name
        String parentDirName = fileSinkOp.getConf().getDirName();


        // Clone the fileSinkDesc of the final fileSink and create similar fileSinks at
        // each parent
        List<FileSinkDesc> fileDescLists = new ArrayList<FileSinkDesc>();


        for (Operator<? extends OperatorDesc> parent : parents) {
          FileSinkDesc fileSinkDesc = (FileSinkDesc) fileSinkOp.getConf().clone();


          String dirName = parentDirName + Path.SEPARATOR + parent.getIdentifier() ;
          fileSinkDesc.setDirName(dirName);
          fileSinkDesc.setLinkedFileSink(true);
          fileSinkDesc.setParentDir(parentDirName);

View Full Code Here

    }


    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      FileSinkOperator fileSinkOp   = (FileSinkOperator)nd;


      // Has this filesink already been processed
      if (fileSinkOp.getConf().isLinkedFileSink()) {
        return null;
      }


      int size = stack.size();
      int pos = size - 2;

View Full Code Here

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {


      BucketingSortingCtx bctx = (BucketingSortingCtx)procCtx;
      FileSinkOperator fop = (FileSinkOperator)nd;


      Operator<? extends OperatorDesc> parent = getParent(stack);
      List<BucketCol> bucketCols = bctx.getBucketedCols(parent);
      List<ColumnInfo> colInfos = fop.getSchema().getSignature();


      // Set the inferred bucket columns for the file this FileSink produces
      if (bucketCols != null) {
        List<BucketCol> newBucketCols = getNewBucketCols(bucketCols, colInfos);
        bctx.getBucketedColsByDirectory().put(fop.getConf().getDirName(), newBucketCols);
        bctx.setBucketedCols(fop, newBucketCols);
      }


      List<SortCol> sortCols = bctx.getSortedCols(parent);


      // Set the inferred sort columns for the file this FileSink produces
      if (sortCols != null) {
        List<SortCol> newSortCols = getNewSortCols(sortCols, colInfos);
        bctx.getSortedColsByDirectory().put(fop.getConf().getDirName(), newSortCols);
        bctx.setSortedCols(fop, newSortCols);
      }


      return null;
    }

View Full Code Here

      Object... nodeOutputs) throws SemanticException {
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    boolean chDir = false;
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    FileSinkOperator fsOp = (FileSinkOperator) nd;
    boolean isInsertTable = // is INSERT OVERWRITE TABLE
    fsOp.getConf().getTableInfo().getTableName() != null &&
        parseCtx.getQB().getParseInfo().isInsertToTable();
    HiveConf hconf = parseCtx.getConf();


    // Mark this task as a final map reduce task (ignoring the optional merge task)
    ((MapredWork)currTask.getWork()).setFinalMapRed(true);


    // If this file sink desc has been processed due to a linked file sink desc,
    // use that task
    Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks();
    if (fileSinkDescs != null) {
      Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf());
      processLinkedFileDesc(ctx, childTask);
      return null;
    }


    // Has the user enabled merging of files for map-only jobs or for all jobs
    if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty())) {
      List<Task<MoveWork>> mvTasks = ctx.getMvTask();


      // In case of unions or map-joins, it is possible that the file has
      // already been seen.
      // So, no need to attempt to merge the files again.
      if ((ctx.getSeenFileSinkOps() == null)
          || (!ctx.getSeenFileSinkOps().contains(nd))) {


        // no need of merging if the move is to a local file system
        MoveTask mvTask = (MoveTask) findMoveTask(mvTasks, fsOp);


        if (isInsertTable && hconf.getBoolVar(ConfVars.HIVESTATSAUTOGATHER)) {
          addStatsTask(fsOp, mvTask, currTask, parseCtx.getConf());
        }


        if ((mvTask != null) && !mvTask.isLocal() && fsOp.getConf().canBeMerged()) {
          if (fsOp.getConf().isLinkedFileSink()) {
            // If the user has HIVEMERGEMAPREDFILES set to false, the idea was the
            // number of reducers are few, so the number of files anyway are small.
            // However, with this optimization, we are increasing the number of files
            // possibly by a big margin. So, merge aggresively.
            if (hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) ||
                hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES)) {
              chDir = true;
            }
          } else {
              // There are separate configuration parameters to control whether to
              // merge for a map-only job
              // or for a map-reduce job
              MapredWork currWork = (MapredWork) currTask.getWork();
              boolean mergeMapOnly =
                  hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) && currWork.getReducer() == null;
              boolean mergeMapRed =
                  hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES) &&
                      currWork.getReducer() != null;
              if (mergeMapOnly || mergeMapRed) {
                chDir = true;
              }
          }
        }
      }
    }


    String finalName = processFS(fsOp, stack, opProcCtx, chDir);


    if (chDir) {
      // Merge the files in the destination table/partitions by creating Map-only merge job
      // If underlying data is RCFile a RCFileBlockMerge task would be created.
      LOG.info("using CombineHiveInputformat for the merge job");
      createMRWorkForMergingFiles(fsOp, ctx, finalName);
    }


    FileSinkDesc fileSinkDesc = fsOp.getConf();
    if (fileSinkDesc.isLinkedFileSink()) {
      Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks =
        ctx.getLinkedFileDescTasks();
      if (linkedFileDescTasks == null) {
        linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>();

View Full Code Here

0 1 2 3 4 5 6 7 8

TOP

Related Classes of org.apache.hadoop.hive.ql.exec.FileSinkOperator

org.apache.hadoop.fs.FileStatus

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.fs.Path

org.apache.hadoop.hive.ql.io.FSRecordWriter

org.apache.hadoop.hive.ql.optimizer.BucketingSortingReduceSinkOptimizer$BucketSortReduceSinkProcessor

org.apache.hadoop.hive.ql.optimizer.ConstantPropagateProcFactory$ConstantPropagateFileSinkProc

org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils

org.apache.hadoop.hive.ql.optimizer.GenMRFileSink1

org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingOpProcFactory$FileSinkInferrer

org.apache.hadoop.hive.ql.optimizer.physical.CommonJoinResolver$CommonJoinTaskDispatcher

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.