baseBigAlias = s;
}
}
}
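// Collect bucketing metadata for every table (or partition) in the join:
// the number of buckets and the names of the bucket files.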
MapJoinDesc mjDesc = mapJoinOp.getConf();
LinkedHashMap<String, Integer> aliasToBucketNumberMapping = new LinkedHashMap<String, Integer>();
LinkedHashMap<String, List<String>> aliasToBucketFileNamesMapping = new LinkedHashMap<String, List<String>>();
// Right now this code does not work with "a join b on a.key = b.key and
// a.ds = b.ds", where ds is a partition column. It only works with joins
// where only one partition is present in each join source table.
Map<String, Operator<? extends Serializable>> topOps = this.pGraphContext.getTopOps();
Map<TableScanOperator, Table> topToTable = this.pGraphContext.getTopToTable();
// (partition -> bucket file names) and (partition -> bucket number) for
// the big table
LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames = new LinkedHashMap<Partition, List<String>>();
LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber = new LinkedHashMap<Partition, Integer>();
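// Walk each join alias and validate that it qualifies for a bucketed map
// join; any disqualifying layout aborts the optimization by returning null.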
for (int index = 0; index < joinAliases.size(); index++) {
String alias = joinAliases.get(index);
TableScanOperator tso = (TableScanOperator) topOps.get(alias);
if (tso == null) {
return null;
}
Table tbl = topToTable.get(tso);
if (tbl.isPartitioned()) {
PrunedPartitionList prunedParts = null;
try {
prunedParts = pGraphContext.getOpToPartList().get(tso);
if (prunedParts == null) {
prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
pGraphContext.getPrunedPartitions());
pGraphContext.getOpToPartList().put(tso, prunedParts);
}
} catch (HiveException e) {
// Has to use full name to make sure it does not conflict with
// org.apache.commons.lang.StringUtils
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
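// Partitions surviving pruning are either confirmed or unknown; both kinds
// count toward the number of partitions for this alias.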
int partNumber = prunedParts.getConfirmedPartns().size()
+ prunedParts.getUnknownPartns().size();
if (partNumber > 1) {
// Only one partition is allowed for each small table.
if (!alias.equals(baseBigAlias)) {
return null;
}
// This is the big table and it has more than one partition: construct a
// mapping of (Partition -> bucket file names) and
// (Partition -> bucket number).
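// Confirmed and unknown pruned partitions are treated alike: each must use
// the join keys as its bucket columns to qualify.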
List<Partition> allParts = new ArrayList<Partition>(prunedParts.getConfirmedPartns());
allParts.addAll(prunedParts.getUnknownPartns());
for (Partition p : allParts) {
if (!checkBucketColumns(p.getBucketCols(), mjDesc, index)) {
return null;
}
List<String> fileNames = getOnePartitionBucketFileNames(p);
bigTblPartsToBucketFileNames.put(p, fileNames);
bigTblPartsToBucketNumber.put(p, p.getBucketCount());
}
// If there is more than one partition for the big table,
// aliasToBucketFileNamesMapping and aliasToBucketNumberMapping will not
// contain mappings for the big table. Instead, the mappings are kept in
// bigTblPartsToBucketFileNames and bigTblPartsToBucketNumber.
} else {
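// Exactly one pruned partition for this alias; it may be either a
// confirmed or an unknown partition.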
Partition part = null;
Iterator<Partition> iter = prunedParts.getConfirmedPartns()
.iterator();
if (iter.hasNext()) {
part = iter.next();
}
if (part == null) {
iter = prunedParts.getUnknownPartns().iterator();
if (iter.hasNext()) {
part = iter.next();
}
}
assert part != null;
Integer num = Integer.valueOf(part.getBucketCount());
aliasToBucketNumberMapping.put(alias, num);
if (!checkBucketColumns(part.getBucketCols(), mjDesc, index)) {
return null;
}
List<String> fileNames = getOnePartitionBucketFileNames(part);
aliasToBucketFileNamesMapping.put(alias, fileNames);
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(part, fileNames);
bigTblPartsToBucketNumber.put(part, num);
}
}
} else {
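// Non-partitioned table: bucket metadata lives on the table itself, and
// the bucket files sit directly under the table's data location.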
if (!checkBucketColumns(tbl.getBucketCols(), mjDesc, index)) {
return null;
}
Integer num = Integer.valueOf(tbl.getNumBuckets());
aliasToBucketNumberMapping.put(alias, num);
List<String> fileNames = new ArrayList<String>();
try {
FileSystem fs = FileSystem.get(tbl.getDataLocation(), this.pGraphContext.getConf());
FileStatus[] files = fs.listStatus(new Path(tbl.getDataLocation().toString()));
if (files != null) {
for (FileStatus file : files) {
fileNames.add(file.getPath().toString());
}
}
} catch (IOException e) {
throw new SemanticException(e);
}
aliasToBucketFileNamesMapping.put(alias, fileNames);
}
}
// All tables or partitions are bucketed, and their bucket numbers are
// stored in 'aliasToBucketNumberMapping'. We need to check that the number
// of buckets in the big table can be divided by the number of buckets in
// the small tables.
if (!bigTblPartsToBucketNumber.isEmpty()) {
for (Integer bucketNumberInPart : bigTblPartsToBucketNumber.values()) {
if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
bucketNumberInPart.intValue())) {
return null;
}
}
} else {
int bucketNoInBigTbl = aliasToBucketNumberMapping.get(baseBigAlias).intValue();
if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
bucketNoInBigTbl)) {
return null;
}
}
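// Build, for every small-table alias, the mapping from each big-table
// bucket file to the small-table bucket files it joins with. As a rough
// sketch: with a 4-bucket big table and a 2-bucket small table, big-table
// bucket i is expected to pair with small-table bucket i % 2; the exact
// pairing is computed by fillMapping.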
LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>> aliasBucketFileNameMapping =
new LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>>();
// Sort bucket file names for the big table.
if (!bigTblPartsToBucketNumber.isEmpty()) {
for (List<String> partBucketNames : bigTblPartsToBucketFileNames.values()) {
Collections.sort(partBucketNames);
}
} else {
Collections.sort(aliasToBucketFileNamesMapping.get(baseBigAlias));
}
// Go through all small tables and compute the mapping from each bucket
// file name in the big table to the bucket file names in the small table.
for (int j = 0; j < joinAliases.size(); j++) {
String alias = joinAliases.get(j);
if (alias.equals(baseBigAlias)) {
continue;
}
Collections.sort(aliasToBucketFileNamesMapping.get(alias));
LinkedHashMap<String, ArrayList<String>> mapping = new LinkedHashMap<String, ArrayList<String>>();
aliasBucketFileNameMapping.put(alias, mapping);
// for each bucket file in big table, get the corresponding bucket file
// name in the small table.
if (!bigTblPartsToBucketNumber.isEmpty()) {
// More than one partition in the big table: build the mapping per partition.
for (Entry<Partition, List<String>> entry : bigTblPartsToBucketFileNames.entrySet()) {
int bigTblBucketNum = bigTblPartsToBucketNumber.get(entry.getKey()).intValue();
List<String> bigTblBucketNameList = entry.getValue();
fillMapping(baseBigAlias, aliasToBucketNumberMapping,
aliasToBucketFileNamesMapping, alias, mapping, bigTblBucketNum,
bigTblBucketNameList, mjDesc.getBucketFileNameMapping());
}
} else {
List<String> bigTblBucketNameList = aliasToBucketFileNamesMapping.get(baseBigAlias);
int bigTblBucketNum = aliasToBucketNumberMapping.get(baseBigAlias);
fillMapping(baseBigAlias, aliasToBucketNumberMapping,
aliasToBucketFileNamesMapping, alias, mapping, bigTblBucketNum,
bigTblBucketNameList, mjDesc.getBucketFileNameMapping());
}
}
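// Publish the computed mappings on the MapJoinDesc so that, at execution
// time, each mapper only loads the small-table bucket files that match
// the big-table bucket it processes.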
mjDesc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
mjDesc.setBigTableAlias(baseBigAlias);
return null;
}