Package org.apache.hadoop.hive.ql.plan

Examples of org.apache.hadoop.hive.ql.plan.MapJoinDesc
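
The excerpts below come from Hive's join optimizers and from VectorMapJoinOperator. They show how a MapJoinDesc is built when a common join, a sort-merge-bucket (SMB) join, or a bucket map join is converted into a map join, and how the descriptor is read back when the operator is set up. For orientation, here is a minimal sketch of the construction pattern; it assumes the 11-argument constructor and the argument types implied by the excerpts, and every parameter is a placeholder for a value the surrounding optimizer code would normally supply.

import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

public class MapJoinDescSketch {

  // All parameters are placeholders; in Hive they are derived from the join
  // operator that is being converted (see the excerpts below).
  static MapJoinDesc buildDescriptor(
      Map<Byte, List<ExprNodeDesc>> keyExprMap,        // join keys per table alias (tag)
      TableDesc keyTableDesc,                          // serialization descriptor for the keys
      Map<Byte, List<ExprNodeDesc>> valueExprs,        // value expressions per alias
      List<TableDesc> valueTableDescs,                 // value table descriptors
      List<TableDesc> valueFilteredTableDescs,         // value descriptors after filtering
      List<String> outputColumnNames,
      int bigTablePos,                                 // position of the big (streamed) table
      JoinCondDesc[] joinConds,
      Map<Byte, List<ExprNodeDesc>> filters,           // residual filter expressions per alias
      boolean noOuterJoin,
      String dumpFilePrefix) {

    MapJoinDesc desc = new MapJoinDesc(keyExprMap, keyTableDesc, valueExprs,
        valueTableDescs, valueFilteredTableDescs, outputColumnNames, bigTablePos,
        joinConds, filters, noOuterJoin, dumpFilePrefix);

    // The optimizers then copy extra metadata from the original join, e.g.
    // desc.setStatistics(...), desc.setTagOrder(...), desc.setNullSafes(...),
    // desc.setFilterMap(...), and, for bucket map joins, desc.setBucketMapJoin(true).
    return desc;
  }
}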


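    // Excerpt: build a MapJoinDesc from an existing join operator's keys, values, conditions
    // and filters, then copy statistics, tag order, null-safes and the filter map before
    // creating the MapJoinOperator.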
    List<String> outputColumnNames = op.getConf().getOutputColumnNames();
    TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(hconf,
        PlanUtils.getFieldSchemasFromColumnList(keyCols, MAPJOINKEY_FIELDPREFIX));
    JoinCondDesc[] joinCondns = op.getConf().getConds();
    MapJoinDesc mapJoinDescriptor = new MapJoinDesc(keyExprMap, keyTableDesc, newValueExprs,
        valueTableDescs, valueFiltedTableDescs, outputColumnNames, mapJoinPos, joinCondns,
        filters, op.getConf().getNoOuterJoin(), dumpFilePrefix);
    mapJoinDescriptor.setStatistics(op.getConf().getStatistics());
    mapJoinDescriptor.setTagOrder(tagOrder);
    mapJoinDescriptor.setNullSafes(desc.getNullSafes());
    mapJoinDescriptor.setFilterMap(desc.getFilterMap());

    // reduce sink row resolver used to generate map join op
    RowResolver outputRS = opParseCtxMap.get(op).getRowResolver();

    MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(
View Full Code Here


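    // Excerpt: convert a sort-merge-bucket (SMB) join descriptor into a MapJoinDesc, reusing
    // the SMB join's keys, expressions, value table descriptors and output schema.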
    // Create a new map join operator
    SMBJoinDesc smbJoinDesc = smbJoinOp.getConf();
    List<ExprNodeDesc> keyCols = smbJoinDesc.getKeys().get(Byte.valueOf((byte) 0));
    TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(hconf, PlanUtils
        .getFieldSchemasFromColumnList(keyCols, MAPJOINKEY_FIELDPREFIX));
    MapJoinDesc mapJoinDesc = new MapJoinDesc(smbJoinDesc.getKeys(),
        keyTableDesc, smbJoinDesc.getExprs(),
        smbJoinDesc.getValueTblDescs(), smbJoinDesc.getValueTblDescs(),
        smbJoinDesc.getOutputColumnNames(),
        bigTablePos, smbJoinDesc.getConds(),
        smbJoinDesc.getFilters(), smbJoinDesc.isNoOuterJoin(), smbJoinDesc.getDumpFilePrefix());

    mapJoinDesc.setStatistics(smbJoinDesc.getStatistics());

    RowResolver joinRS = opParseCtxMap.get(smbJoinOp).getRowResolver();
    // The mapjoin has the same schema as the join operator
    MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(
        mapJoinDesc, joinRS.getRowSchema(),
View Full Code Here

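  // Excerpt: turn a map join into a bucket map join by recording, for each small-table alias,
  // the mapping from big-table bucket files to small-table bucket files on the MapJoinDesc.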
   * enhanced to keep the big table bucket -> small table buckets mapping.
   */
  protected void convertMapJoinToBucketMapJoin(
      MapJoinOperator mapJoinOp,
      BucketJoinProcCtx context) throws SemanticException {
    MapJoinDesc desc = mapJoinOp.getConf();

    Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
        new LinkedHashMap<String, Map<String, List<String>>>();

    Map<String, List<Integer>> tblAliasToNumberOfBucketsInEachPartition =
        context.getTblAliasToNumberOfBucketsInEachPartition();

    Map<String, List<List<String>>> tblAliasToBucketedFilePathsInEachPartition =
        context.getTblAliasToBucketedFilePathsInEachPartition();

    Map<Partition, List<String>> bigTblPartsToBucketFileNames =
        context.getBigTblPartsToBucketFileNames();

    Map<Partition, Integer> bigTblPartsToBucketNumber =
        context.getBigTblPartsToBucketNumber();

    List<String> joinAliases = context.getJoinAliases();
    String baseBigAlias = context.getBaseBigAlias();

    // sort bucket names for the big table
    for (List<String> partBucketNames : bigTblPartsToBucketFileNames.values()) {
      Collections.sort(partBucketNames);
    }

    // go through all small tables and get the mapping from bucket file name
    // in the big table to bucket file names in small tables.
    for (int j = 0; j < joinAliases.size(); j++) {
      String alias = joinAliases.get(j);
      if (alias.equals(baseBigAlias)) {
        continue;
      }
      for (List<String> names : tblAliasToBucketedFilePathsInEachPartition.get(alias)) {
        Collections.sort(names);
      }
      List<Integer> smallTblBucketNums = tblAliasToNumberOfBucketsInEachPartition.get(alias);
      List<List<String>> smallTblFilesList = tblAliasToBucketedFilePathsInEachPartition.get(alias);

      Map<String, List<String>> mappingBigTableBucketFileNameToSmallTableBucketFileNames =
          new LinkedHashMap<String, List<String>>();
      aliasBucketFileNameMapping.put(alias,
          mappingBigTableBucketFileNameToSmallTableBucketFileNames);

      // For each bucket file in the big table, find the corresponding bucket
      // file names in the small table. If the big table has more than one
      // partition, do the mapping for each partition.
      Iterator<Entry<Partition, List<String>>> bigTblPartToBucketNames =
          bigTblPartsToBucketFileNames.entrySet().iterator();
      Iterator<Entry<Partition, Integer>> bigTblPartToBucketNum = bigTblPartsToBucketNumber
          .entrySet().iterator();
      while (bigTblPartToBucketNames.hasNext()) {
        assert bigTblPartToBucketNum.hasNext();
        int bigTblBucketNum = bigTblPartToBucketNum.next().getValue();
        List<String> bigTblBucketNameList = bigTblPartToBucketNames.next().getValue();
        fillMappingBigTableBucketFileNameToSmallTableBucketFileNames(smallTblBucketNums,
            smallTblFilesList,
            mappingBigTableBucketFileNameToSmallTableBucketFileNames, bigTblBucketNum,
            bigTblBucketNameList,
            desc.getBigTableBucketNumMapping());
      }
    }
    desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
    desc.setBigTableAlias(baseBigAlias);
    boolean bigTablePartitioned = context.isBigTablePartitioned();
    if (bigTablePartitioned) {
      desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames));
    }

    // successfully convert to bucket map join
    desc.setBucketMapJoin(true);
  }
View Full Code Here

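    // Excerpt: flag a Tez map join as a bucket map join, propagate bucket traits to its
    // children, and record the big table's bucket count on the descriptor.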
      return false;
    }

    MapJoinOperator mapJoinOp =
      convertJoinMapJoin(joinOp, context, bigTablePosition);
    MapJoinDesc joinDesc = mapJoinOp.getConf();
    joinDesc.setBucketMapJoin(true);

    // we can set the traits for this join operator
    OpTraits opTraits = new OpTraits(joinOp.getOpTraits().getBucketColNames(),
        tezBucketJoinProcCtx.getNumBuckets());
    mapJoinOp.setOpTraits(opTraits);
    setNumberOfBucketsOnChildren(mapJoinOp);

    // Once the conversion is done, we can set the partitioner to bucket cols on the small table   
    Map<String, Integer> bigTableBucketNumMapping = new HashMap<String, Integer>();
    bigTableBucketNumMapping.put(joinDesc.getBigTableAlias(), tezBucketJoinProcCtx.getNumBuckets());
    joinDesc.setBigTableBucketNumMapping(bigTableBucketNumMapping);
    LOG.info("Setting legacy map join to " + (!tezBucketJoinProcCtx.isSubQuery()));
    joinDesc.setCustomBucketMapJoin(!tezBucketJoinProcCtx.isSubQuery());

    return true;
  }
View Full Code Here

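  // Excerpt: VectorMapJoinOperator constructor reading tag order, big-table position, filters,
  // keys and value expressions from a MapJoinDesc to set up vectorized evaluation and the
  // output VectorizationContext.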
  public VectorMapJoinOperator (VectorizationContext vContext, OperatorDesc conf)
    throws HiveException {
    this();

    MapJoinDesc desc = (MapJoinDesc) conf;
    this.conf = desc;

    order = desc.getTagOrder();
    numAliases = desc.getExprs().size();
    posBigTable = (byte) desc.getPosBigTable();
    filterMaps = desc.getFilterMap();
    tagLen = desc.getTagLength();
    noOuterJoin = desc.isNoOuterJoin();

    Map<Byte, List<ExprNodeDesc>> filterExpressions = desc.getFilters();
    bigTableFilterExpressions = vContext.getVectorExpressions(filterExpressions.get(posBigTable),
        VectorExpressionDescriptor.Mode.FILTER);

    List<ExprNodeDesc> keyDesc = desc.getKeys().get(posBigTable);
    keyExpressions = vContext.getVectorExpressions(keyDesc);

    // We're only going to evaluate the big table vectorized expressions.
    Map<Byte, List<ExprNodeDesc>> exprs = desc.getExprs();
    bigTableValueExpressions = vContext.getVectorExpressions(exprs.get(posBigTable));

    List<String> outColNames = desc.getOutputColumnNames();
   
    Map<String, Integer> mapOutCols = new HashMap<String, Integer>(outColNames.size());
   
    int outColIndex = 0;
    for(String outCol: outColNames) {
      mapOutCols.put(outCol,  outColIndex++);
    }
   
    vOutContext = new VectorizationContext(mapOutCols, outColIndex);
    vOutContext.setFileKey(vContext.getFileKey() + "/MAP_JOIN_" + desc.getBigTableAlias());
    this.fileKey = vOutContext.getFileKey();
  }
View Full Code Here

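  // Excerpt: build the MapJoinDesc for an SMB join conversion on Tez, either through
  // MapJoinProcessor.getMapJoinDesc or directly from the original JoinDesc, before creating
  // the CommonMergeJoinOperator.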
  // parent reduce sinks
  private void convertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcContext context,
      int mapJoinConversionPos, int numBuckets, boolean isSubQuery, boolean adjustParentsChildren)
      throws SemanticException {
    ParseContext parseContext = context.parseContext;
    MapJoinDesc mapJoinDesc = null;
    if (adjustParentsChildren) {
        mapJoinDesc = MapJoinProcessor.getMapJoinDesc(context.conf, parseContext.getOpParseCtx(),
            joinOp, parseContext.getJoinContext().get(joinOp), mapJoinConversionPos, true);
    } else {
      JoinDesc joinDesc = joinOp.getConf();
      // retain the original join desc in the map join.
      mapJoinDesc =
          new MapJoinDesc(null, null, joinDesc.getExprs(), null, null,
              joinDesc.getOutputColumnNames(), mapJoinConversionPos, joinDesc.getConds(),
              joinDesc.getFilters(), joinDesc.getNoOuterJoin(), null);
      mapJoinDesc.setNullSafes(joinDesc.getNullSafes());
      mapJoinDesc.setFilterMap(joinDesc.getFilterMap());
      mapJoinDesc.resetOrder();
    }

    @SuppressWarnings("unchecked")
    CommonMergeJoinOperator mergeJoinOp =
        (CommonMergeJoinOperator) OperatorFactory.get(new CommonMergeJoinDesc(numBuckets,
View Full Code Here

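    // Excerpt: a later variant of the bucket map join conversion that also copies the join
    // operator's statistics onto the new map join.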
      LOG.info("Check conversion to bucket map join failed.");
      return false;
    }

    MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, bigTablePosition);
    MapJoinDesc joinDesc = mapJoinOp.getConf();
    joinDesc.setBucketMapJoin(true);

    // we can set the traits for this join operator
    OpTraits opTraits = new OpTraits(joinOp.getOpTraits().getBucketColNames(),
        tezBucketJoinProcCtx.getNumBuckets(), null);
    mapJoinOp.setOpTraits(opTraits);
    mapJoinOp.setStatistics(joinOp.getStatistics());
    setNumberOfBucketsOnChildren(mapJoinOp);

    // Once the conversion is done, we can set the partitioner to bucket cols on the small table
    Map<String, Integer> bigTableBucketNumMapping = new HashMap<String, Integer>();
    bigTableBucketNumMapping.put(joinDesc.getBigTableAlias(), tezBucketJoinProcCtx.getNumBuckets());
    joinDesc.setBigTableBucketNumMapping(bigTableBucketNumMapping);
    LOG.info("Setting legacy map join to " + (!tezBucketJoinProcCtx.isSubQuery()));
    joinDesc.setCustomBucketMapJoin(!tezBucketJoinProcCtx.isSubQuery());

    return true;
  }
View Full Code Here

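    // Excerpt: connect a map join to its parent work, estimate the number of keys from the
    // reduce sink's statistics, and choose the Tez edge type (a custom edge for bucket map
    // joins, broadcast otherwise).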
    // set the link between mapjoin and parent vertex
    int pos = context.mapJoinParentMap.get(mapJoinOp).indexOf(parentRS);
    if (pos == -1) {
      throw new SemanticException("Cannot find position of parent in mapjoin");
    }
    MapJoinDesc joinConf = mapJoinOp.getConf();
    long keyCount = Long.MAX_VALUE, rowCount = Long.MAX_VALUE, bucketCount = 1;
    Statistics stats = parentRS.getStatistics();
    if (stats != null) {
      keyCount = rowCount = stats.getNumRows();
      if (keyCount <= 0) {
        keyCount = rowCount = Long.MAX_VALUE;
      }
      ArrayList<String> keyCols = parentRS.getConf().getOutputKeyColumnNames();
      if (keyCols != null && !keyCols.isEmpty()) {
        // See if we can arrive at a smaller number using distinct stats from key columns.
        long maxKeyCount = 1;
        String prefix = Utilities.ReduceField.KEY.toString();
        for (String keyCol : keyCols) {
          ExprNodeDesc realCol = parentRS.getColumnExprMap().get(prefix + "." + keyCol);
          ColStatistics cs =
              StatsUtils.getColStatisticsFromExpression(context.conf, stats, realCol);
          if (cs == null || cs.getCountDistint() <= 0) {
            maxKeyCount = Long.MAX_VALUE;
            break;
          }
          maxKeyCount *= cs.getCountDistint();
          if (maxKeyCount >= keyCount) {
            break;
          }
        }
        keyCount = Math.min(maxKeyCount, keyCount);
      }
      if (joinConf.isBucketMapJoin()) {
        OpTraits opTraits = mapJoinOp.getOpTraits();
        bucketCount = (opTraits == null) ? -1 : opTraits.getNumBuckets();
        if (bucketCount > 0) {
          // We cannot obtain a better estimate without CustomPartitionVertex providing it
          // to us somehow; in which case using statistics would be completely unnecessary.
          keyCount /= bucketCount;
        }
      }
    }
    LOG.info("Mapjoin " + mapJoinOp + ", pos: " + pos + " --> " + parentWork.getName() + " ("
      + keyCount + " keys estimated from " + rowCount + " rows, " + bucketCount + " buckets)");
    joinConf.getParentToInput().put(pos, parentWork.getName());
    if (keyCount != Long.MAX_VALUE) {
      joinConf.getParentKeyCounts().put(pos, keyCount);
    }

    int numBuckets = -1;
    EdgeType edgeType = EdgeType.BROADCAST_EDGE;
    if (joinConf.isBucketMapJoin()) {

      // disable auto parallelism for bucket map joins
      parentRS.getConf().setReducerTraits(EnumSet.of(FIXED));

      numBuckets = (Integer) joinConf.getBigTableBucketNumMapping().values().toArray()[0];
      if (joinConf.getCustomBucketMapJoin()) {
        edgeType = EdgeType.CUSTOM_EDGE;
      } else {
        edgeType = EdgeType.CUSTOM_SIMPLE_EDGE;
      }
    }
View Full Code Here


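  // Excerpt: a variant of the VectorMapJoinOperator constructor that builds the output
  // VectorizationContext directly from the MapJoinDesc output column names.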
  public VectorMapJoinOperator (VectorizationContext vContext, OperatorDesc conf)
    throws HiveException {
    this();

    MapJoinDesc desc = (MapJoinDesc) conf;
    this.conf = desc;

    order = desc.getTagOrder();
    numAliases = desc.getExprs().size();
    posBigTable = (byte) desc.getPosBigTable();
    filterMaps = desc.getFilterMap();
    noOuterJoin = desc.isNoOuterJoin();

    Map<Byte, List<ExprNodeDesc>> filterExpressions = desc.getFilters();
    bigTableFilterExpressions = vContext.getVectorExpressions(filterExpressions.get(posBigTable),
        VectorExpressionDescriptor.Mode.FILTER);

    List<ExprNodeDesc> keyDesc = desc.getKeys().get(posBigTable);
    keyExpressions = vContext.getVectorExpressions(keyDesc);

    // We're only going to evaluate the big table vectorized expressions.
    Map<Byte, List<ExprNodeDesc>> exprs = desc.getExprs();
    bigTableValueExpressions = vContext.getVectorExpressions(exprs.get(posBigTable));

    // We are making a new output vectorized row batch.
    vOutContext = new VectorizationContext(desc.getOutputColumnNames());
    vOutContext.setFileKey(vContext.getFileKey() + "/MAP_JOIN_" + desc.getBigTableAlias());
  }
View Full Code Here
