Examples of org.apache.hadoop.hive.ql.exec.MapJoinOperator

org.apache.hadoop.hive.ql.exec.MapJoinOperator
Map side Join operator implementation.

        Map.Entry<JoinOperator, QBJoinTree> joinEntry = joinCtxIter.next();
        JoinOperator joinOp = joinEntry.getKey();
        QBJoinTree qbJoin = joinEntry.getValue();
        int mapJoinPos = mapSideJoin(joinOp, qbJoin);
        if (mapJoinPos >= 0) {
          MapJoinOperator mapJoinOp = generateMapJoinOperator(pactx, joinOp, qbJoin, mapJoinPos);
          listMapJoinOps.add(mapJoinOp);
          mapJoinMap.put(mapJoinOp, qbJoin);
        } else {
          joinMap.put(joinOp, qbJoin);
        }

View Full Code Here

   */
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs)
      throws SemanticException {
    GenTezProcContext context = (GenTezProcContext) procContext;
    MapJoinOperator mapJoinOp = (MapJoinOperator)nd;


    if (stack.size() < 2 || !(stack.get(stack.size() - 2) instanceof ReduceSinkOperator)) {
      context.currentMapJoinOperators.add(mapJoinOp);
      return null;
    }


    context.preceedingWork = null;
    context.currentRootOperator = null;


    ReduceSinkOperator parentRS = (ReduceSinkOperator)stack.get(stack.size() - 2);


    // remember the original parent list before we start modifying it.
    if (!context.mapJoinParentMap.containsKey(mapJoinOp)) {
      List<Operator<?>> parents = new ArrayList(mapJoinOp.getParentOperators());
      context.mapJoinParentMap.put(mapJoinOp, parents);
    }


    List<BaseWork> mapJoinWork = null;


    /*
     *  if there was a pre-existing work generated for the big-table mapjoin side,
     *  we need to hook the work generated for the RS (associated with the RS-MJ pattern)
     *  with the pre-existing work.
     *
     *  Otherwise, we need to associate that the mapjoin op
     *  to be linked to the RS work (associated with the RS-MJ pattern).
     *
     */
    mapJoinWork = context.mapJoinWorkMap.get(mapJoinOp);
    BaseWork parentWork;
    if (context.unionWorkMap.containsKey(parentRS)) {
      parentWork = context.unionWorkMap.get(parentRS);
    } else {
      assert context.childToWorkMap.get(parentRS).size() == 1;
      parentWork = context.childToWorkMap.get(parentRS).get(0);
    }


    // set the link between mapjoin and parent vertex
    int pos = context.mapJoinParentMap.get(mapJoinOp).indexOf(parentRS);
    if (pos == -1) {
      throw new SemanticException("Cannot find position of parent in mapjoin");
    }
    LOG.debug("Mapjoin "+mapJoinOp+", pos: "+pos+" --> "+parentWork.getName());
    mapJoinOp.getConf().getParentToInput().put(pos, parentWork.getName());


    int numBuckets = -1;
    EdgeType edgeType = EdgeType.BROADCAST_EDGE;
    if (mapJoinOp.getConf().isBucketMapJoin()) {
      numBuckets = (Integer) mapJoinOp.getConf().getBigTableBucketNumMapping().values().toArray()[0];
      if (mapJoinOp.getConf().getCustomBucketMapJoin()) {
        edgeType = EdgeType.CUSTOM_EDGE;
      } else {
        edgeType = EdgeType.CUSTOM_SIMPLE_EDGE;
      }
    }
    TezEdgeProperty edgeProp = new TezEdgeProperty(null, edgeType, numBuckets);


    if (mapJoinWork != null) {
      for (BaseWork myWork: mapJoinWork) {
        // link the work with the work associated with the reduce sink that triggered this rule
        TezWork tezWork = context.currentTask.getWork();
        LOG.debug("connecting "+parentWork.getName()+" with "+myWork.getName());
        tezWork.connect(parentWork, myWork, edgeProp);
        
        ReduceSinkOperator r = null;
        if (parentRS.getConf().getOutputName() != null) {
          LOG.debug("Cloning reduce sink for multi-child broadcast edge");
          // we've already set this one up. Need to clone for the next work.
          r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(
              (ReduceSinkDesc) parentRS.getConf().clone(), parentRS.getParentOperators());
          context.clonedReduceSinks.add(r);
        } else {
          r = parentRS;
        }
        // remember the output name of the reduce sink
        r.getConf().setOutputName(myWork.getName());
        context.connectedReduceSinks.add(r);
      }
    }


    // remember in case we need to connect additional work later
    Map<BaseWork, TezEdgeProperty> linkWorkMap = null;
    if (context.linkOpWithWorkMap.containsKey(mapJoinOp)) {
      linkWorkMap = context.linkOpWithWorkMap.get(mapJoinOp);
    } else {
      linkWorkMap = new HashMap<BaseWork, TezEdgeProperty>();
    }
    linkWorkMap.put(parentWork, edgeProp);
    context.linkOpWithWorkMap.put(mapJoinOp, linkWorkMap);
    
    List<ReduceSinkOperator> reduceSinks 
      = context.linkWorkWithReduceSinkMap.get(parentWork);
    if (reduceSinks == null) {
      reduceSinks = new ArrayList<ReduceSinkOperator>();
    }
    reduceSinks.add(parentRS);
    context.linkWorkWithReduceSinkMap.put(parentWork, reduceSinks);


    // create the dummy operators
    List<Operator<? extends OperatorDesc>> dummyOperators =
        new ArrayList<Operator<? extends OperatorDesc>>();


    // create an new operator: HashTableDummyOperator, which share the table desc
    HashTableDummyDesc desc = new HashTableDummyDesc();
    @SuppressWarnings("unchecked")
    HashTableDummyOperator dummyOp = (HashTableDummyOperator) OperatorFactory.get(desc);
    TableDesc tbl;


    // need to create the correct table descriptor for key/value
    RowSchema rowSchema = parentRS.getParentOperators().get(0).getSchema();
    tbl = PlanUtils.getReduceValueTableDesc(PlanUtils.getFieldSchemasFromRowSchema(rowSchema, ""));
    dummyOp.getConf().setTbl(tbl);


    Map<Byte, List<ExprNodeDesc>> keyExprMap = mapJoinOp.getConf().getKeys();
    List<ExprNodeDesc> keyCols = keyExprMap.get(Byte.valueOf((byte) 0));
    StringBuffer keyOrder = new StringBuffer();
    for (ExprNodeDesc k: keyCols) {
      keyOrder.append("+");
    }
    TableDesc keyTableDesc = PlanUtils.getReduceKeyTableDesc(PlanUtils
        .getFieldSchemasFromColumnList(keyCols, "mapjoinkey"), keyOrder.toString());
    mapJoinOp.getConf().setKeyTableDesc(keyTableDesc);


    // let the dummy op be the parent of mapjoin op
    mapJoinOp.replaceParent(parentRS, dummyOp);
    List<Operator<? extends OperatorDesc>> dummyChildren =
      new ArrayList<Operator<? extends OperatorDesc>>();
    dummyChildren.add(mapJoinOp);
    dummyOp.setChildOperators(dummyChildren);
    dummyOperators.add(dummyOp);

View Full Code Here

      }


      newPlan.setMapLocalWork(localPlan);


      // construct a map join and set it as the child operator of tblScan_op
      MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory
          .getAndMakeChild(mapJoinDescriptor, (RowSchema) null, parentOps);
      // change the children of the original join operator to point to the map
      // join operator
      List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp
          .getChildOperators();
      for (Operator<? extends OperatorDesc> childOp : childOps) {
        childOp.replaceParent(cloneJoinOp, mapJoinOp);
      }
      mapJoinOp.setChildOperators(childOps);


      HiveConf jc = new HiveConf(parseCtx.getConf(),
          GenMRSkewJoinProcessor.class);


      newPlan.setNumMapTasks(HiveConf

View Full Code Here

      Object... nodeOutputs) throws SemanticException {
    if (nd instanceof SMBMapJoinOperator) {
      return null;
    }


    MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
    SortBucketJoinProcCtx smbJoinContext = (SortBucketJoinProcCtx) procCtx;


    boolean convert =
        canConvertBucketMapJoinToSMBJoin(mapJoinOp, stack, smbJoinContext, nodeOutputs);

View Full Code Here

      int pos = 0; // it doesn't matter which position we use in this case.
      convertJoinSMBJoin(joinOp, context, pos, 0, false, false);
      return null;
    }


    MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, mapJoinConversionPos);
    // map join operator by default has no bucket cols
    mapJoinOp.setOpTraits(new OpTraits(null, -1, null));
    mapJoinOp.setStatistics(joinOp.getStatistics());
    // propagate this change till the next RS
    for (Operator<? extends OperatorDesc> childOp : mapJoinOp.getChildOperators()) {
      setAllChildrenTraitsToNull(childOp);
    }


    return null;
  }

View Full Code Here

    if (!checkConvertJoinBucketMapJoin(joinOp, context, bigTablePosition, tezBucketJoinProcCtx)) {
      LOG.info("Check conversion to bucket map join failed.");
      return false;
    }


    MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, bigTablePosition);
    MapJoinDesc joinDesc = mapJoinOp.getConf();
    joinDesc.setBucketMapJoin(true);


    // we can set the traits for this join operator
    OpTraits opTraits = new OpTraits(joinOp.getOpTraits().getBucketColNames(),
        tezBucketJoinProcCtx.getNumBuckets(), null);
    mapJoinOp.setOpTraits(opTraits);
    mapJoinOp.setStatistics(joinOp.getStatistics());
    setNumberOfBucketsOnChildren(mapJoinOp);


    // Once the conversion is done, we can set the partitioner to bucket cols on the small table
    Map<String, Integer> bigTableBucketNumMapping = new HashMap<String, Integer>();
    bigTableBucketNumMapping.put(joinDesc.getBigTableAlias(), tezBucketJoinProcCtx.getNumBuckets());

View Full Code Here

      }
    }


    //can safely convert the join to a map join.
    ParseContext parseContext = context.parseContext;
    MapJoinOperator mapJoinOp =
        MapJoinProcessor.convertJoinOpMapJoinOp(context.conf, parseContext.getOpParseCtx(), joinOp,
            parseContext.getJoinContext().get(joinOp), bigTablePosition, true);


    Operator<? extends OperatorDesc> parentBigTableOp =
        mapJoinOp.getParentOperators().get(bigTablePosition);
    if (parentBigTableOp instanceof ReduceSinkOperator) {
      for (Operator<?> p : parentBigTableOp.getParentOperators()) {
        // we might have generated a dynamic partition operator chain. Since
        // we're removing the reduce sink we need do remove that too.
        Set<Operator<?>> dynamicPartitionOperators = new HashSet<Operator<?>>();
        for (Operator<?> c : p.getChildOperators()) {
          if (hasDynamicPartitionBroadcast(c)) {
            dynamicPartitionOperators.add(c);
          }
        }
        for (Operator<?> c : dynamicPartitionOperators) {
          p.removeChild(c);
        }
      }
      mapJoinOp.getParentOperators().remove(bigTablePosition);
      if (!(mapJoinOp.getParentOperators().contains(parentBigTableOp.getParentOperators().get(0)))) {
        mapJoinOp.getParentOperators().add(bigTablePosition,
            parentBigTableOp.getParentOperators().get(0));
      }
      parentBigTableOp.getParentOperators().get(0).removeChild(parentBigTableOp);
      for (Operator<? extends OperatorDesc> op : mapJoinOp.getParentOperators()) {
        if (!(op.getChildOperators().contains(mapJoinOp))) {
          op.getChildOperators().add(mapJoinOp);
        }
        op.getChildOperators().remove(joinOp);
      }

View Full Code Here

  // Convert the join operator to a bucket map-join join operator
  protected MapJoinOperator convertJoinToBucketMapJoin(
    JoinOperator joinOp,
    SortBucketJoinProcCtx joinContext,
    ParseContext parseContext) throws SemanticException {
    MapJoinOperator mapJoinOp = MapJoinProcessor.convertMapJoin(
      parseContext.getConf(),
      parseContext.getOpParseCtx(),
      joinOp,
      pGraphContext.getJoinContext().get(joinOp),
      joinContext.getBigTablePosition(),

View Full Code Here

  // Convert the join operator to a sort-merge join operator
  protected void convertJoinToSMBJoin(
    JoinOperator joinOp,
    SortBucketJoinProcCtx smbJoinContext,
    ParseContext parseContext) throws SemanticException {
    MapJoinOperator mapJoinOp = convertJoinToBucketMapJoin(joinOp, smbJoinContext, parseContext);
    SMBMapJoinOperator smbMapJoinOp =
        convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext, parseContext);
    smbMapJoinOp.setConvertedAutomaticallySMBJoin(true);
  }

View Full Code Here

   */
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs)
      throws SemanticException {
    GenTezProcContext context = (GenTezProcContext) procContext;
    MapJoinOperator mapJoinOp = (MapJoinOperator)nd;


    if (stack.size() < 2 || !(stack.get(stack.size() - 2) instanceof ReduceSinkOperator)) {
      context.currentMapJoinOperators.add(mapJoinOp);
      return null;
    }


    context.preceedingWork = null;
    context.currentRootOperator = null;


    ReduceSinkOperator parentRS = (ReduceSinkOperator)stack.get(stack.size() - 2);
    // remove the tag for in-memory side of mapjoin
    parentRS.getConf().setSkipTag(true);
    parentRS.setSkipTag(true);
    // remember the original parent list before we start modifying it.
    if (!context.mapJoinParentMap.containsKey(mapJoinOp)) {
      List<Operator<?>> parents = new ArrayList(mapJoinOp.getParentOperators());
      context.mapJoinParentMap.put(mapJoinOp, parents);
    }


    List<BaseWork> mapJoinWork = null;


    /*
     *  if there was a pre-existing work generated for the big-table mapjoin side,
     *  we need to hook the work generated for the RS (associated with the RS-MJ pattern)
     *  with the pre-existing work.
     *
     *  Otherwise, we need to associate that the mapjoin op
     *  to be linked to the RS work (associated with the RS-MJ pattern).
     *
     */
    mapJoinWork = context.mapJoinWorkMap.get(mapJoinOp);
    BaseWork parentWork;
    if (context.unionWorkMap.containsKey(parentRS)) {
      parentWork = context.unionWorkMap.get(parentRS);
    } else {
      assert context.childToWorkMap.get(parentRS).size() == 1;
      parentWork = context.childToWorkMap.get(parentRS).get(0);
    }


    // set the link between mapjoin and parent vertex
    int pos = context.mapJoinParentMap.get(mapJoinOp).indexOf(parentRS);
    if (pos == -1) {
      throw new SemanticException("Cannot find position of parent in mapjoin");
    }
    MapJoinDesc joinConf = mapJoinOp.getConf();
    long keyCount = Long.MAX_VALUE, rowCount = Long.MAX_VALUE, bucketCount = 1;
    Statistics stats = parentRS.getStatistics();
    if (stats != null) {
      keyCount = rowCount = stats.getNumRows();
      if (keyCount <= 0) {
        keyCount = rowCount = Long.MAX_VALUE;
      }
      ArrayList<String> keyCols = parentRS.getConf().getOutputKeyColumnNames();
      if (keyCols != null && !keyCols.isEmpty()) {
        // See if we can arrive at a smaller number using distinct stats from key columns.
        long maxKeyCount = 1;
        String prefix = Utilities.ReduceField.KEY.toString();
        for (String keyCol : keyCols) {
          ExprNodeDesc realCol = parentRS.getColumnExprMap().get(prefix + "." + keyCol);
          ColStatistics cs =
              StatsUtils.getColStatisticsFromExpression(context.conf, stats, realCol);
          if (cs == null || cs.getCountDistint() <= 0) {
            maxKeyCount = Long.MAX_VALUE;
            break;
          }
          maxKeyCount *= cs.getCountDistint();
          if (maxKeyCount >= keyCount) {
            break;
          }
        }
        keyCount = Math.min(maxKeyCount, keyCount);
      }
      if (joinConf.isBucketMapJoin()) {
        OpTraits opTraits = mapJoinOp.getOpTraits();
        bucketCount = (opTraits == null) ? -1 : opTraits.getNumBuckets();
        if (bucketCount > 0) {
          // We cannot obtain a better estimate without CustomPartitionVertex providing it
          // to us somehow; in which case using statistics would be completely unnecessary.
          keyCount /= bucketCount;
        }
      }
    }
    LOG.info("Mapjoin " + mapJoinOp + ", pos: " + pos + " --> " + parentWork.getName() + " ("
      + keyCount + " keys estimated from " + rowCount + " rows, " + bucketCount + " buckets)");
    joinConf.getParentToInput().put(pos, parentWork.getName());
    if (keyCount != Long.MAX_VALUE) {
      joinConf.getParentKeyCounts().put(pos, keyCount);
    }


    int numBuckets = -1;
    EdgeType edgeType = EdgeType.BROADCAST_EDGE;
    if (joinConf.isBucketMapJoin()) {


      // disable auto parallelism for bucket map joins
      parentRS.getConf().setReducerTraits(EnumSet.of(FIXED));


      numBuckets = (Integer) joinConf.getBigTableBucketNumMapping().values().toArray()[0];
      if (joinConf.getCustomBucketMapJoin()) {
        edgeType = EdgeType.CUSTOM_EDGE;
      } else {
        edgeType = EdgeType.CUSTOM_SIMPLE_EDGE;
      }
    }
    TezEdgeProperty edgeProp = new TezEdgeProperty(null, edgeType, numBuckets);


    if (mapJoinWork != null) {
      for (BaseWork myWork: mapJoinWork) {
        // link the work with the work associated with the reduce sink that triggered this rule
        TezWork tezWork = context.currentTask.getWork();
        LOG.debug("connecting "+parentWork.getName()+" with "+myWork.getName());
        tezWork.connect(parentWork, myWork, edgeProp);
        if (edgeType == EdgeType.CUSTOM_EDGE) {
          tezWork.setVertexType(myWork, VertexType.INITIALIZED_EDGES);
        }


        ReduceSinkOperator r = null;
        if (parentRS.getConf().getOutputName() != null) {
          LOG.debug("Cloning reduce sink for multi-child broadcast edge");
          // we've already set this one up. Need to clone for the next work.
          r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(
              (ReduceSinkDesc) parentRS.getConf().clone(), parentRS.getParentOperators());
          context.clonedReduceSinks.add(r);
        } else {
          r = parentRS;
        }
        // remember the output name of the reduce sink
        r.getConf().setOutputName(myWork.getName());
        context.connectedReduceSinks.add(r);
      }
    }


    // remember in case we need to connect additional work later
    Map<BaseWork, TezEdgeProperty> linkWorkMap = null;
    if (context.linkOpWithWorkMap.containsKey(mapJoinOp)) {
      linkWorkMap = context.linkOpWithWorkMap.get(mapJoinOp);
    } else {
      linkWorkMap = new HashMap<BaseWork, TezEdgeProperty>();
    }
    linkWorkMap.put(parentWork, edgeProp);
    context.linkOpWithWorkMap.put(mapJoinOp, linkWorkMap);
    
    List<ReduceSinkOperator> reduceSinks 
      = context.linkWorkWithReduceSinkMap.get(parentWork);
    if (reduceSinks == null) {
      reduceSinks = new ArrayList<ReduceSinkOperator>();
    }
    reduceSinks.add(parentRS);
    context.linkWorkWithReduceSinkMap.put(parentWork, reduceSinks);


    // create the dummy operators
    List<Operator<?>> dummyOperators = new ArrayList<Operator<?>>();


    // create an new operator: HashTableDummyOperator, which share the table desc
    HashTableDummyDesc desc = new HashTableDummyDesc();
    @SuppressWarnings("unchecked")
    HashTableDummyOperator dummyOp = (HashTableDummyOperator) OperatorFactory.get(desc);
    TableDesc tbl;


    // need to create the correct table descriptor for key/value
    RowSchema rowSchema = parentRS.getParentOperators().get(0).getSchema();
    tbl = PlanUtils.getReduceValueTableDesc(PlanUtils.getFieldSchemasFromRowSchema(rowSchema, ""));
    dummyOp.getConf().setTbl(tbl);


    Map<Byte, List<ExprNodeDesc>> keyExprMap = mapJoinOp.getConf().getKeys();
    List<ExprNodeDesc> keyCols = keyExprMap.get(Byte.valueOf((byte) 0));
    StringBuffer keyOrder = new StringBuffer();
    for (ExprNodeDesc k: keyCols) {
      keyOrder.append("+");
    }
    TableDesc keyTableDesc = PlanUtils.getReduceKeyTableDesc(PlanUtils
        .getFieldSchemasFromColumnList(keyCols, "mapjoinkey"), keyOrder.toString());
    mapJoinOp.getConf().setKeyTableDesc(keyTableDesc);


    // let the dummy op be the parent of mapjoin op
    mapJoinOp.replaceParent(parentRS, dummyOp);
    List<Operator<? extends OperatorDesc>> dummyChildren =
      new ArrayList<Operator<? extends OperatorDesc>>();
    dummyChildren.add(mapJoinOp);
    dummyOp.setChildOperators(dummyChildren);
    dummyOperators.add(dummyOp);

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.hadoop.hive.ql.exec.MapJoinOperator

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.fs.Path

org.apache.hadoop.hive.ql.exec.HashTableSinkOperator.HashTableSinkObjectCtx

org.apache.hadoop.hive.ql.exec.persistence.AbstractMapJoinKey

org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey

org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectKey

org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext

org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectValue

org.apache.hadoop.hive.ql.exec.persistence.MapJoinRowContainer

org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.