    this.pGraphContext = pGraphContext;
  }
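
  /**
   * Checks whether the given map join can be converted into a bucket map join.
   * Every table (or pruned partition) taking part in the join must be bucketed
   * on the join keys, the number of bucket files on disk must match the
   * declared bucket count, and the small tables' bucket counts must be
   * compatible with the big table's. Returns false as soon as any of these
   * preconditions fails.
   */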
  private boolean convertBucketMapJoin(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
    BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
    HiveConf conf = context.getConf();
    if (context.getListOfRejectedMapjoins().contains(mapJoinOp)) {
      return false;
    }
    QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext().get(mapJoinOp);
    if (joinCxt == null) {
      return false;
    }
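    // Collect every alias taking part in the join; the alias that is not a
    // map-side (small-table) alias is treated as the big table.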
    List<String> joinAliases = new ArrayList<String>();
    String[] srcs = joinCxt.getBaseSrc();
    String[] left = joinCxt.getLeftAliases();
    List<String> mapAlias = joinCxt.getMapAliases();
    String baseBigAlias = null;
    for (String s : left) {
      if (s != null && !joinAliases.contains(s)) {
        joinAliases.add(s);
        if (!mapAlias.contains(s)) {
          baseBigAlias = s;
        }
      }
    }
    for (String s : srcs) {
      if (s != null && !joinAliases.contains(s)) {
        joinAliases.add(s);
        if (!mapAlias.contains(s)) {
          baseBigAlias = s;
        }
      }
    }
    MapJoinDesc mjDesc = mapJoinOp.getConf();
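    // Per small-table alias: the bucket count and the bucket file names of
    // each pruned partition.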
    LinkedHashMap<String, List<Integer>> aliasToPartitionBucketNumberMapping =
        new LinkedHashMap<String, List<Integer>>();
    LinkedHashMap<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping =
        new LinkedHashMap<String, List<List<String>>>();
    Map<String, Operator<? extends OperatorDesc>> topOps =
        this.pGraphContext.getTopOps();
    Map<TableScanOperator, Table> topToTable = this.pGraphContext.getTopToTable();
    // (partition -> bucket file names) and (partition -> bucket number) for
    // the big table
    LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames =
        new LinkedHashMap<Partition, List<String>>();
    LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber =
        new LinkedHashMap<Partition, Integer>();
    // order in which the join columns map to the bucket columns; must be the
    // same for every table in the join
    Integer[] orders = null;
    boolean bigTablePartitioned = true;
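    // For every alias in the join, make sure the table (or each of its pruned
    // partitions) is bucketed on the join keys and collect the bucket counts
    // and bucket file names.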
    for (int index = 0; index < joinAliases.size(); index++) {
      String alias = joinAliases.get(index);
      TableScanOperator tso = (TableScanOperator) topOps.get(alias);
      if (tso == null) {
        return false;
      }
      List<String> keys = toColumns(mjDesc.getKeys().get((byte) index));
      if (keys == null || keys.isEmpty()) {
        return false;
      }
      if (orders == null) {
        orders = new Integer[keys.size()];
      }
      Table tbl = topToTable.get(tso);
      if (tbl.isPartitioned()) {
        PrunedPartitionList prunedParts;
        try {
          prunedParts = pGraphContext.getOpToPartList().get(tso);
          if (prunedParts == null) {
            prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso),
                pGraphContext.getConf(), alias, pGraphContext.getPrunedPartitions());
            pGraphContext.getOpToPartList().put(tso, prunedParts);
          }
        } catch (HiveException e) {
          // Has to use the full name to make sure it does not conflict with
          // org.apache.commons.lang.StringUtils
          LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
          throw new SemanticException(e.getMessage(), e);
        }
        List<Partition> partitions = prunedParts.getNotDeniedPartns();
        // construct a mapping of (partition -> bucket file names) and (partition -> bucket number)
        if (partitions.isEmpty()) {
          if (!alias.equals(baseBigAlias)) {
            aliasToPartitionBucketNumberMapping.put(alias, Arrays.<Integer>asList());
            aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList<List<String>>());
          }
        } else {
          List<Integer> buckets = new ArrayList<Integer>();
          List<List<String>> files = new ArrayList<List<String>>();
          for (Partition p : partitions) {
            if (!checkBucketColumns(p.getBucketCols(), keys, orders)) {
              return false;
            }
            List<String> fileNames = getOnePartitionBucketFileNames(p.getDataLocation());
            // The number of files in the partition should be the same as its number of buckets.
            int bucketCount = p.getBucketCount();
            if (fileNames.size() != bucketCount) {
              String msg = "The number of buckets for table " +
                  tbl.getTableName() + " partition " + p.getName() + " is " +
                  bucketCount + ", whereas the number of files is " + fileNames.size();
              throw new SemanticException(
                  ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
            }
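            // big-table partitions are recorded per partition; for the small
            // tables the bucket info is accumulated per alias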
            if (alias.equals(baseBigAlias)) {
              bigTblPartsToBucketFileNames.put(p, fileNames);
              bigTblPartsToBucketNumber.put(p, bucketCount);
            } else {
              files.add(fileNames);
              buckets.add(bucketCount);
            }
          }
          if (!alias.equals(baseBigAlias)) {
            aliasToPartitionBucketNumberMapping.put(alias, buckets);
            aliasToPartitionBucketFileNamesMapping.put(alias, files);
          }
        }
      } else {
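        // non-partitioned table: validate the bucketing metadata of the table itself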
        if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) {
          return false;
        }
        List<String> fileNames = getOnePartitionBucketFileNames(tbl.getDataLocation());
        Integer num = Integer.valueOf(tbl.getNumBuckets());
        // The number of files for the table should be the same as its number of buckets.
        if (fileNames.size() != num) {
          String msg = "The number of buckets for table " +
              tbl.getTableName() + " is " + tbl.getNumBuckets() +
              ", whereas the number of files is " + fileNames.size();
          throw new SemanticException(
              ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
        }
        if (alias.equals(baseBigAlias)) {
          bigTblPartsToBucketFileNames.put(null, fileNames);
          bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
          bigTablePartitioned = false;
        } else {
          aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num));
          aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames));
        }
      }
    }
    // At this point every table or partition in the join is bucketed and its
    // bucket count has been collected above; check that the number of buckets
    // in the big table is divisible by the number of buckets in each small table.
    for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) {
      if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) {
        return false;
      }
    }
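    // Build the mapping that the bucket map join uses at run time:
    // small-table alias -> (big-table bucket file name -> matching small-table bucket files).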
    MapJoinDesc desc = mapJoinOp.getConf();
    Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
        new LinkedHashMap<String, Map<String, List<String>>>();
    // sort bucket names for the big table