Package org.apache.hadoop.hive.ql.exec

Examples of org.apache.hadoop.hive.ql.exec.FileSinkOperator


        // FileSink cannot be simply cloned - it requires some special processing.
        // Sub-queries for the union will be processed as independent map-reduce jobs
        // possibly running in parallel. Those sub-queries cannot write to the same
        // directory. Clone the filesink, but create a sub-directory in the final path
        // for each sub-query. Also, these different filesinks need to be linked to each other.
        FileSinkOperator fileSinkOp = (FileSinkOperator)stack.get(pos);
        // For file sink operator, change the directory name
        Path parentDirName = fileSinkOp.getConf().getDirName();

        // Clone the fileSinkDesc of the final fileSink and create similar fileSinks at
        // each parent
        List<FileSinkDesc> fileDescLists = new ArrayList<FileSinkDesc>();

        for (Operator<? extends OperatorDesc> parent : parents) {
          FileSinkDesc fileSinkDesc = (FileSinkDesc) fileSinkOp.getConf().clone();
          fileSinkDesc.setDirName(new Path(parentDirName, parent.getIdentifier()));
          fileSinkDesc.setLinkedFileSink(true);
          fileSinkDesc.setParentDir(parentDirName);
          parent.setChildOperators(null);
          Operator<? extends OperatorDesc> tmpFileSinkOp =
View Full Code Here
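
The excerpt is cut off just as the cloned descriptor is turned into a new operator. Below is a minimal sketch of the per-parent cloning step it demonstrates, using only the calls that appear above; the class and method names (LinkedFileSinkSketch, cloneLinkedFileSinkDescs) are illustrative, not Hive APIs.

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
    import org.apache.hadoop.hive.ql.exec.Operator;
    import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
    import org.apache.hadoop.hive.ql.plan.OperatorDesc;

    final class LinkedFileSinkSketch {

      // Clone the final FileSinkDesc once per union branch; each clone writes to its
      // own sub-directory under the original target so parallel jobs never collide.
      static List<FileSinkDesc> cloneLinkedFileSinkDescs(
          FileSinkOperator finalSink, List<Operator<? extends OperatorDesc>> parents) {
        Path parentDirName = finalSink.getConf().getDirName();
        List<FileSinkDesc> clones = new ArrayList<FileSinkDesc>();
        for (Operator<? extends OperatorDesc> parent : parents) {
          FileSinkDesc clone = (FileSinkDesc) finalSink.getConf().clone();
          clone.setDirName(new Path(parentDirName, parent.getIdentifier()));
          clone.setLinkedFileSink(true);      // later passes treat the clones as one logical sink
          clone.setParentDir(parentDirName);  // remember the shared final directory
          clones.add(clone);
        }
        return clones;
      }
    }

Each clone targets a sub-directory named after its parent's identifier, so the parallel sub-query jobs write to disjoint paths while staying linked to the same final location.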


    }

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      FileSinkOperator fileSinkOp = (FileSinkOperator) nd;

      // Has this filesink already been processed?
      if (fileSinkOp.getConf().isLinkedFileSink()) {
        return null;
      }

      int size = stack.size();
      int pos = size - 2;
View Full Code Here
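
For context on the stack arithmetic above: during the operator-graph walk the current node is on top of the stack, so index size - 1 is the FileSinkOperator itself and size - 2 is the operator that feeds it. A small illustrative sketch (the helper is not a Hive API):

    import java.util.Stack;

    import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
    import org.apache.hadoop.hive.ql.lib.Node;

    final class WalkerStackSketch {
      // Returns the node directly above the FileSinkOperator in the current walk.
      static Node nodeFeedingFileSink(Stack<Node> stack) {
        int size = stack.size();
        assert stack.get(size - 1) instanceof FileSinkOperator; // the sink itself is on top
        return stack.get(size - 2);                             // the operator that feeds it
      }
    }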

    ParseContext parseCtx = ctx.getParseCtx();
    boolean chDir = false;
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    ctx.addRootIfPossible(currTask);

    FileSinkOperator fsOp = (FileSinkOperator) nd;
    boolean isInsertTable = // is INSERT OVERWRITE TABLE
        GenMapRedUtils.isInsertInto(parseCtx, fsOp);
    HiveConf hconf = parseCtx.getConf();

    // Mark this task as a final map reduce task (ignoring the optional merge task)
    ((MapredWork)currTask.getWork()).setFinalMapRed(true);

    // If this file sink desc has been processed due to a linked file sink desc,
    // use that task
    Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks();
    if (fileSinkDescs != null) {
      Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf());
      processLinkedFileDesc(ctx, childTask);
      return true;
    }

    // In case of unions or map-joins, it is possible that the file has
    // already been seen.
    // So, no need to attempt to merge the files again.
    if ((ctx.getSeenFileSinkOps() == null)
        || (!ctx.getSeenFileSinkOps().contains(nd))) {
      chDir = GenMapRedUtils.isMergeRequired(ctx.getMvTask(), hconf, fsOp, currTask, isInsertTable);
    }

    Path finalName = processFS(fsOp, stack, opProcCtx, chDir);

    if (chDir) {
      // Merge the files in the destination table/partitions by creating a map-only merge job.
      // If the underlying data is RCFile, an RCFileBlockMerge task will be created.
      LOG.info("using CombineHiveInputformat for the merge job");
      GenMapRedUtils.createMRWorkForMergingFiles(fsOp, finalName,
          ctx.getDependencyTaskForMultiInsert(), ctx.getMvTask(),
          hconf, currTask);
    }

    FileSinkDesc fileSinkDesc = fsOp.getConf();
    if (fileSinkDesc.isLinkedFileSink()) {
      Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks =
        ctx.getLinkedFileDescTasks();
      if (linkedFileDescTasks == null) {
        linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>();
View Full Code Here
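
The excerpt is truncated while the linked-file-sink bookkeeping map is being created. As a hypothetical sketch (not the actual Hive code) of the pattern it sets up: each linked FileSinkDesc is recorded against the task that plans it, so the lookup near the top of the method finds an existing task for sibling sinks and skips re-planning them.

    import java.io.Serializable;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.hadoop.hive.ql.exec.Task;
    import org.apache.hadoop.hive.ql.plan.FileSinkDesc;

    final class LinkedFileDescBookkeepingSketch {
      private final Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks =
          new HashMap<FileSinkDesc, Task<? extends Serializable>>();

      // Record the task planned for a linked sink descriptor.
      void remember(FileSinkDesc desc, Task<? extends Serializable> task) {
        linkedFileDescTasks.put(desc, task);
      }

      // Return the previously planned task for a linked sink, or null if unseen.
      Task<? extends Serializable> lookup(FileSinkDesc desc) {
        return linkedFileDescTasks.get(desc);
      }
    }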

  public Object process(Node nd, Stack<Node> stack,
      NodeProcessorCtx procCtx, Object... nodeOutputs)
      throws SemanticException {

    GenTezProcContext context = (GenTezProcContext) procCtx;
    FileSinkOperator fileSink = (FileSinkOperator) nd;

    // just remember it for later processing
    context.fileSinkSet.add(fileSink);
    return true;
  }
View Full Code Here
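
The Tez path above deliberately defers the work: the walker only records each FileSinkOperator, and a separate pass after the walk handles the whole set. A sketch of that collect-then-process pattern (processAll and handle are illustrative names):

    import java.util.LinkedHashSet;
    import java.util.Set;

    import org.apache.hadoop.hive.ql.exec.FileSinkOperator;

    final class DeferredFileSinkSketch {
      // Insertion-ordered so sinks are handled in the order they were encountered.
      private final Set<FileSinkOperator> fileSinkSet = new LinkedHashSet<FileSinkOperator>();

      void remember(FileSinkOperator fileSink) {
        fileSinkSet.add(fileSink);
      }

      void processAll() {
        for (FileSinkOperator fileSink : fileSinkSet) {
          handle(fileSink); // per-sink planning happens here, after the whole graph was walked
        }
      }

      private void handle(FileSinkOperator fileSink) {
        // placeholder for whatever per-sink work the compiler performs later
      }
    }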

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      BucketingSortingCtx bctx = (BucketingSortingCtx)procCtx;
      FileSinkOperator fop = (FileSinkOperator)nd;

      Operator<? extends OperatorDesc> parent = getParent(stack);
      List<BucketCol> bucketCols = bctx.getBucketedCols(parent);
      List<ColumnInfo> colInfos = fop.getSchema().getSignature();

      // Set the inferred bucket columns for the file this FileSink produces
      if (bucketCols != null) {
        List<BucketCol> newBucketCols = getNewBucketCols(bucketCols, colInfos);
        bctx.getBucketedColsByDirectory().put(fop.getConf().getDirName().toString(), newBucketCols);
        bctx.setBucketedCols(fop, newBucketCols);
      }

      List<SortCol> sortCols = bctx.getSortedCols(parent);

      // Set the inferred sort columns for the file this FileSink produces
      if (sortCols != null) {
        List<SortCol> newSortCols = getNewSortCols(sortCols, colInfos);
        bctx.getSortedColsByDirectory().put(fop.getConf().getDirName().toString(), newSortCols);
        bctx.setSortedCols(fop, newSortCols);
      }

      return null;
    }
View Full Code Here
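
The processor above records the inferred bucket and sort columns keyed by the sink's output directory string. A sketch of how a consumer might read those maps back, assuming the getter counterparts shown in the excerpt; the helper is illustrative:

    import java.util.List;

    import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
    import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx;

    final class InferredColsLookupSketch {
      // Both maps are keyed by the sink's output directory as a String,
      // exactly like the puts in the excerpt above.
      static boolean hasInferredMetadata(BucketingSortingCtx bctx, FileSinkOperator fop) {
        String dir = fop.getConf().getDirName().toString();
        List<?> bucketCols = bctx.getBucketedColsByDirectory().get(dir); // List<BucketCol> in the excerpt
        List<?> sortCols = bctx.getSortedColsByDirectory().get(dir);     // List<SortCol> in the excerpt
        return bucketCols != null || sortCols != null;
      }
    }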

    // Create a FileSink operator
    TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
    FileSinkDesc fsOutputDesc = new FileSinkDesc(finalName, ts,
      conf.getBoolVar(ConfVars.COMPRESSRESULT));
    FileSinkOperator fsOutput = (FileSinkOperator) OperatorFactory.getAndMakeChild(
      fsOutputDesc, inputRS, tsMerge);

    // If the input FileSinkOperator has dynamic partitioning enabled, the tsMerge input schema
    // needs to include the partition columns, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that its output needs to be dynamically partitioned.
View Full Code Here
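
A condensed sketch of the construction above: clone the input sink's TableDesc so the merge job writes files with the same layout, take the compression flag from the configuration, and attach the new FileSinkOperator under the merge TableScan. It reuses only the constructor and factory call from the excerpt; the wrapper method is illustrative.

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
    import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
    import org.apache.hadoop.hive.ql.exec.Operator;
    import org.apache.hadoop.hive.ql.exec.OperatorFactory;
    import org.apache.hadoop.hive.ql.exec.RowSchema;
    import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
    import org.apache.hadoop.hive.ql.plan.OperatorDesc;
    import org.apache.hadoop.hive.ql.plan.TableDesc;

    final class MergeFileSinkSketch {
      static FileSinkOperator makeMergeSink(FileSinkDesc fsInputDesc, Path finalName,
          HiveConf conf, RowSchema inputRS, Operator<? extends OperatorDesc> tsMerge) {
        // Reuse the input sink's table layout so the merge job writes identically formatted files.
        TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
        FileSinkDesc fsOutputDesc = new FileSinkDesc(finalName, ts,
            conf.getBoolVar(ConfVars.COMPRESSRESULT));
        // Attach the merge-side FileSinkOperator under the merge TableScan.
        return (FileSinkOperator) OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);
      }
    }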

        mapWork.setAliasToWork(map);
        return;
      }
    });

    fs = new FileSinkOperator();
    fs.setConf(new FileSinkDesc());
    rs = new ReduceSinkOperator();
    rs.setConf(new ReduceSinkDesc());
    ts = new TableScanOperator();
    ts.setConf(new TableScanDesc());
View Full Code Here
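
A small extension of the hand-built test fixtures above: give the FileSinkOperator a target directory through its descriptor, using only setters that appear elsewhere on this page. The helper and the path value are illustrative.

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
    import org.apache.hadoop.hive.ql.plan.FileSinkDesc;

    final class FileSinkTestFixtureSketch {
      static FileSinkOperator newFileSink(Path dir) {
        FileSinkDesc desc = new FileSinkDesc();
        desc.setDirName(dir);              // where this sink would write
        FileSinkOperator fs = new FileSinkOperator();
        fs.setConf(desc);
        return fs;
      }
    }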

    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // introduce RS and EX before FS. If the operator tree already contains
      // RS then ReduceSinkDeDuplication optimization should merge them
      FileSinkOperator fsOp = (FileSinkOperator) nd;

      LOG.info("Sorted dynamic partitioning optimization kicked in..");

      // if not dynamic partitioning then bail out
      if (fsOp.getConf().getDynPartCtx() == null) {
        LOG.debug("Bailing out of sort dynamic partition optimization as dynamic partitioning context is null");
        return null;
      }

      // if list bucketing then bail out
      ListBucketingCtx lbCtx = fsOp.getConf().getLbCtx();
      if (lbCtx != null && !lbCtx.getSkewedColNames().isEmpty()
          && !lbCtx.getSkewedColValues().isEmpty()) {
        LOG.debug("Bailing out of sort dynamic partition optimization as list bucketing is enabled");
        return null;
      }

      Table destTable = parseCtx.getFsopToTable().get(fsOp);
      if (destTable == null) {
        LOG.debug("Bailing out of sort dynamic partition optimization as destination table is null");
        return null;
      }

      // If an RS was inserted by enforce bucketing or sorting, we need to remove it,
      // since ReduceSinkDeDuplication will not merge the two into a single RS.
      // An RS inserted by enforce bucketing/sorting has the bucketing columns in its
      // reduce sink key, whereas the RS inserted by this optimization has
      // partition columns followed by the bucket number followed by sort columns in
      // the reduce sink key. Since neither key list is a prefix of the other,
      // ReduceSinkDeDuplication will not merge them, resulting in 2 MR jobs.
      // To avoid that, we remove the RS (and EX) inserted by enforce bucketing/sorting.
      if (!removeRSInsertedByEnforceBucketing(fsOp)) {
        LOG.debug("Bailing out of sort dynamic partition optimization as some partition columns " +
            "got constant folded.");
        return null;
      }

      // unlink connection between FS and its parent
      Operator<? extends OperatorDesc> fsParent = fsOp.getParentOperators().get(0);
      fsParent.getChildOperators().clear();

      DynamicPartitionCtx dpCtx = fsOp.getConf().getDynPartCtx();
      int numBuckets = destTable.getNumBuckets();

      // If enforce bucketing/sorting is disabled, numBuckets will not be set.
      // Set the number of buckets here to ensure empty buckets are still created.
      dpCtx.setNumBuckets(numBuckets);

      // Get the positions for partition, bucket and sort columns
      List<Integer> bucketPositions = getBucketPositions(destTable.getBucketCols(),
          destTable.getCols());
      ObjectPair<List<Integer>, List<Integer>> sortOrderPositions = getSortPositionsOrder(
          destTable.getSortCols(), destTable.getCols());
      List<Integer> sortPositions = null;
      List<Integer> sortOrder = null;
      if (fsOp.getConf().getWriteType() == AcidUtils.Operation.UPDATE ||
          fsOp.getConf().getWriteType() == AcidUtils.Operation.DELETE) {
        // When doing updates and deletes we always want to sort on the rowid because the ACID
        // reader will expect this sort order when doing reads.  So
        // ignore whatever comes from the table and enforce this sort order instead.
        sortPositions = Arrays.asList(0);
        sortOrder = Arrays.asList(1); // 1 means ascending; could really use an enum from the thrift definition here
      } else {
        sortPositions = sortOrderPositions.getFirst();
        sortOrder = sortOrderPositions.getSecond();
      }
      LOG.debug("Got sort order");
      for (int i : sortPositions) LOG.debug("sort position " + i);
      for (int i : sortOrder) LOG.debug("sort order " + i);
      List<Integer> partitionPositions = getPartitionPositions(dpCtx, fsParent.getSchema());
      List<ColumnInfo> colInfos = parseCtx.getOpParseCtx().get(fsParent).getRowResolver()
          .getColumnInfos();
      ArrayList<ExprNodeDesc> bucketColumns = getPositionsToExprNodes(bucketPositions, colInfos);

      // update file sink descriptor
      fsOp.getConf().setMultiFileSpray(false);
      fsOp.getConf().setNumFiles(1);
      fsOp.getConf().setTotalFiles(1);

      // Create ReduceSinkDesc
      RowResolver inputRR = parseCtx.getOpParseCtx().get(fsParent).getRowResolver();
      ObjectPair<String, RowResolver> pair = copyRowResolver(inputRR);
      RowResolver outRR = pair.getSecond();
      ArrayList<ColumnInfo> valColInfo = Lists.newArrayList(fsParent.getSchema().getSignature());
      ArrayList<ExprNodeDesc> newValueCols = Lists.newArrayList();
      Map<String, ExprNodeDesc> colExprMap = Maps.newHashMap();
      for (ColumnInfo ci : valColInfo) {
        newValueCols.add(new ExprNodeColumnDesc(ci));
        colExprMap.put(ci.getInternalName(), newValueCols.get(newValueCols.size() - 1));
      }
      ReduceSinkDesc rsConf = getReduceSinkDesc(partitionPositions, sortPositions, sortOrder,
          newValueCols, bucketColumns, numBuckets, fsParent, fsOp.getConf().getWriteType());

      if (!bucketColumns.isEmpty()) {
        String tableAlias = outRR.getColumnInfos().get(0).getTabAlias();
        ColumnInfo ci = new ColumnInfo(BUCKET_NUMBER_COL_NAME, TypeInfoFactory.stringTypeInfo,
            tableAlias, true, true);
        outRR.put(tableAlias, BUCKET_NUMBER_COL_NAME, ci);
      }

      // Create ReduceSink operator
      ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(
          OperatorFactory.getAndMakeChild(rsConf, new RowSchema(outRR.getColumnInfos()), fsParent),
          outRR, parseCtx);
      rsOp.setColumnExprMap(colExprMap);

      // Create ExtractDesc
      ObjectPair<String, RowResolver> exPair = copyRowResolver(outRR);
      RowResolver exRR = exPair.getSecond();
      ExtractDesc exConf = new ExtractDesc(new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
          Utilities.ReduceField.VALUE.toString(), "", false));

      // Create Extract Operator
      ExtractOperator exOp = (ExtractOperator) putOpInsertMap(
          OperatorFactory.getAndMakeChild(exConf, new RowSchema(exRR.getColumnInfos()), rsOp),
          exRR, parseCtx);

      // link EX to FS
      fsOp.getParentOperators().clear();
      fsOp.getParentOperators().add(exOp);
      exOp.getChildOperators().add(fsOp);

      // Set if partition sorted or partition bucket sorted
      fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_SORTED);
      if (bucketColumns.size() > 0) {
        fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_BUCKET_SORTED);
      }

      // update partition column info in FS descriptor
      ArrayList<ExprNodeDesc> partitionColumns = getPositionsToExprNodes(partitionPositions, rsOp
          .getSchema().getSignature());
      fsOp.getConf().setPartitionCols(partitionColumns);

      LOG.info("Inserted " + rsOp.getOperatorId() + " and " + exOp.getOperatorId()
          + " as parent of " + fsOp.getOperatorId() + " and child of " + fsParent.getOperatorId());
      return null;
    }
View Full Code Here
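
The graph surgery above reduces to: detach the FileSink from its parent, build the ReduceSink and Extract operators under that parent, then relink Extract -> FileSink. A generic sketch of that relinking pattern follows; the helper name is illustrative, and it assumes the operators' parent/child lists are already initialized (as they are for operators produced by OperatorFactory).

    import org.apache.hadoop.hive.ql.exec.Operator;
    import org.apache.hadoop.hive.ql.plan.OperatorDesc;

    final class OperatorSpliceSketch {
      // Insert the chain chainHead ... chainTail between parent and child.
      static void splice(Operator<? extends OperatorDesc> parent,
          Operator<? extends OperatorDesc> child,
          Operator<? extends OperatorDesc> chainHead,
          Operator<? extends OperatorDesc> chainTail) {
        // 1. unlink the direct parent -> child edge
        parent.getChildOperators().clear();
        child.getParentOperators().clear();
        // 2. parent feeds the head of the new chain
        parent.getChildOperators().add(chainHead);
        chainHead.getParentOperators().add(parent);
        // 3. the tail of the new chain feeds the original child
        chainTail.getChildOperators().add(child);
        child.getParentOperators().add(chainTail);
      }
    }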

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // If the reduce sink has not been introduced due to bucketing/sorting, ignore it
      FileSinkOperator fsOp = (FileSinkOperator) nd;
      ExtractOperator exOp = (ExtractOperator) fsOp.getParentOperators().get(0);
      ReduceSinkOperator rsOp = (ReduceSinkOperator) exOp.getParentOperators().get(0);

      List<ReduceSinkOperator> rsOps = pGraphContext
          .getReduceSinkOperatorsAddedByEnforceBucketingSorting();
      // nothing to do
      if ((rsOps != null) && (!rsOps.contains(rsOp))) {
        return null;
      }

      // Don't do this optimization with updates or deletes
      if (pGraphContext.getContext().getAcidOperation() == AcidUtils.Operation.UPDATE ||
          pGraphContext.getContext().getAcidOperation() == AcidUtils.Operation.DELETE){
        return null;
      }

      // Support for dynamic partitions can be added later
      if (fsOp.getConf().getDynPartCtx() != null) {
        return null;
      }

      // No conversion is possible for the reduce keys
      for (ExprNodeDesc keyCol : rsOp.getConf().getKeyCols()) {
View Full Code Here
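
The method above reaches the ReduceSink by walking up from the FileSink through its Extract parent. A tiny illustrative sketch of that navigation (the helper is not a Hive API):

    import org.apache.hadoop.hive.ql.exec.ExtractOperator;
    import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
    import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;

    final class ParentChainSketch {
      // FS -> EX -> RS, as in the excerpt above.
      static ReduceSinkOperator reduceSinkFeeding(FileSinkOperator fsOp) {
        ExtractOperator exOp = (ExtractOperator) fsOp.getParentOperators().get(0);
        return (ReduceSinkOperator) exOp.getParentOperators().get(0);
      }
    }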

   * @param cols The list of columns.
   */
  public void setLineage(Path dir, DataContainer dc,
      List<FieldSchema> cols) {
    // First lookup the file sink operator from the load work.
    FileSinkOperator fop = dirToFop.get(dir);

    // Go over the associated fields and look up the dependencies
    // by position in the row schema of the filesink operator.
    if (fop == null) {
      return;
    }

    List<ColumnInfo> signature = fop.getSchema().getSignature();
    int i = 0;
    for (FieldSchema fs : cols) {
      linfo.putDependency(dc, fs, index.getDependency(fop, signature.get(i++)));
    }
  }
View Full Code Here
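
setLineage depends on a dirToFop map from output directory to the FileSinkOperator that writes it. A hypothetical sketch of the registration side (not the actual Hive code), using only the getDirName() call shown on this page:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.FileSinkOperator;

    final class DirToFileSinkIndexSketch {
      private final Map<Path, FileSinkOperator> dirToFop = new HashMap<Path, FileSinkOperator>();

      // Remember which file sink writes to which directory, keyed by its target path.
      void register(FileSinkOperator fop) {
        dirToFop.put(fop.getConf().getDirName(), fop);
      }

      // Later, lineage resolution can recover the sink from the directory alone.
      FileSinkOperator lookup(Path dir) {
        return dirToFop.get(dir);
      }
    }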
