Package org.apache.hadoop.hive.ql.plan

Examples of org.apache.hadoop.hive.ql.plan.MapJoinDesc
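The snippets below show how Hive's optimizers and task compilers construct and configure a MapJoinDesc when a common join is converted into a map join. As a quick orientation, here is a minimal sketch of the pattern they all share. The helper name is hypothetical and the parameter types are inferred from the call sites and from JoinDesc, so treat this as an illustration rather than a verbatim Hive API reference.

import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

public final class MapJoinDescSketch {

  // Sketch only: the argument order mirrors the constructor calls in the
  // snippets below (keys, key TableDesc, values, value TableDescs,
  // value-filtered TableDescs, output columns, big-table position, join
  // conditions, filters, noOuterJoin flag, dump-file prefix).
  static MapJoinDesc buildMapJoinDesc(
      Map<Byte, List<ExprNodeDesc>> keyExprMap,
      TableDesc keyTableDesc,
      Map<Byte, List<ExprNodeDesc>> valueExprMap,
      List<TableDesc> valueTableDescs,
      List<TableDesc> valueFilteredTableDescs,
      List<String> outputColumnNames,
      int mapJoinPos,
      JoinCondDesc[] joinCondns,
      Map<Byte, List<ExprNodeDesc>> filterMap,
      boolean noOuterJoin,
      String dumpFilePrefix) {
    MapJoinDesc desc = new MapJoinDesc(keyExprMap, keyTableDesc, valueExprMap,
        valueTableDescs, valueFilteredTableDescs, outputColumnNames, mapJoinPos,
        joinCondns, filterMap, noOuterJoin, dumpFilePrefix);
    // Callers then record per-input metadata on the descriptor, e.g.
    // setTagOrder(...); skew-join conversion additionally calls
    // setHandleSkewJoin(false) and setNullSafes(...).
    return desc;
  }
}

After construction, the descriptor is attached to a MapJoinOperator together with the output row schema via OperatorFactory.getAndMakeChild, as the first snippet shows.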


      }
      dumpFilePrefix = dumpFilePrefix+"-"+PlanUtils.getCountForMapJoinDumpFilePrefix();
    } else {
      dumpFilePrefix = "mapfile"+PlanUtils.getCountForMapJoinDumpFilePrefix();
    }
    MapJoinDesc mapJoinDescriptor = new MapJoinDesc(keyExprMap, keyTableDesc, valueExprMap,
        valueTableDescs, valueFiltedTableDescs, outputColumnNames, mapJoinPos, joinCondns,
        filterMap, op.getConf().getNoOuterJoin(), dumpFilePrefix);
    mapJoinDescriptor.setTagOrder(tagOrder);

    MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(
        mapJoinDescriptor, new RowSchema(outputRS.getColumnInfos()), newPar);

    OpParseContext ctx = new OpParseContext(outputRS);


            baseBigAlias = s;
          }
        }
      }

      MapJoinDesc mjDecs = mapJoinOp.getConf();
      LinkedHashMap<String, Integer> aliasToBucketNumberMapping = new LinkedHashMap<String, Integer>();
      LinkedHashMap<String, List<String>> aliasToBucketFileNamesMapping = new LinkedHashMap<String, List<String>>();
      // Right now this code does not work with "a join b on a.key = b.key and
      // a.ds = b.ds", where ds is a partition column. It only works with joins
      // where each join source table has only one partition present.
      Map<String, Operator<? extends Serializable>> topOps = this.pGraphContext.getTopOps();
      Map<TableScanOperator, Table> topToTable = this.pGraphContext.getTopToTable();

      // (partition to bucket file names) and (partition to bucket number) for
      // the big table;
      LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames = new LinkedHashMap<Partition, List<String>>();
      LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber = new LinkedHashMap<Partition, Integer>();

      for (int index = 0; index < joinAliases.size(); index++) {
        String alias = joinAliases.get(index);
        TableScanOperator tso = (TableScanOperator) topOps.get(alias);
        if (tso == null) {
          return null;
        }
        Table tbl = topToTable.get(tso);
        if(tbl.isPartitioned()) {
          PrunedPartitionList prunedParts = null;
          try {
            prunedParts = pGraphContext.getOpToPartList().get(tso);
            if (prunedParts == null) {
              prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
                pGraphContext.getPrunedPartitions());
              pGraphContext.getOpToPartList().put(tso, prunedParts);
            }
          } catch (HiveException e) {
            // Has to use full name to make sure it does not conflict with
            // org.apache.commons.lang.StringUtils
            LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
            throw new SemanticException(e.getMessage(), e);
          }
          int partNumber = prunedParts.getConfirmedPartns().size()
              + prunedParts.getUnknownPartns().size();

          if (partNumber > 1) {
            // only allow one partition for small tables
            if (!alias.equals(baseBigAlias)) {
              return null;
            }
            // This is the big table, and it has more than one partition.
            // Construct a mapping of (Partition -> bucket file names) and
            // (Partition -> bucket number).
            Iterator<Partition> iter = prunedParts.getConfirmedPartns()
                .iterator();
            while (iter.hasNext()) {
              Partition p = iter.next();
              if (!checkBucketColumns(p.getBucketCols(), mjDecs, index)) {
                return null;
              }
              List<String> fileNames = getOnePartitionBucketFileNames(p);
              bigTblPartsToBucketFileNames.put(p, fileNames);
              bigTblPartsToBucketNumber.put(p, p.getBucketCount());
            }
            iter = prunedParts.getUnknownPartns().iterator();
            while (iter.hasNext()) {
              Partition p = iter.next();
              if (!checkBucketColumns(p.getBucketCols(), mjDecs, index)) {
                return null;
              }
              List<String> fileNames = getOnePartitionBucketFileNames(p);
              bigTblPartsToBucketFileNames.put(p, fileNames);
              bigTblPartsToBucketNumber.put(p, p.getBucketCount());
            }
            // If there is more than one partition for the big table,
            // aliasToBucketFileNamesMapping and aliasToBucketNumberMapping will
            // not contain mappings for the big table. Instead, the mappings are
            // kept in bigTblPartsToBucketFileNames and
            // bigTblPartsToBucketNumber.

          } else {
            Partition part = null;
            Iterator<Partition> iter = prunedParts.getConfirmedPartns()
                .iterator();
            if (iter.hasNext()) {
              part = iter.next();
            }
            if (part == null) {
              iter = prunedParts.getUnknownPartns().iterator();
              if (iter.hasNext()) {
                part = iter.next();
              }
            }
            assert part != null;
            Integer num = Integer.valueOf(part.getBucketCount());
            aliasToBucketNumberMapping.put(alias, num);
            if (!checkBucketColumns(part.getBucketCols(), mjDecs, index)) {
              return null;
            }
            List<String> fileNames = getOnePartitionBucketFileNames(part);
            aliasToBucketFileNamesMapping.put(alias, fileNames);
            if (alias.equals(baseBigAlias)) {
              bigTblPartsToBucketFileNames.put(part, fileNames);
              bigTblPartsToBucketNumber.put(part, num);
            }
          }
        } else {
          if (!checkBucketColumns(tbl.getBucketCols(), mjDecs, index)) {
            return null;
          }
          Integer num = Integer.valueOf(tbl.getNumBuckets());
          aliasToBucketNumberMapping.put(alias, num);
          List<String> fileNames = new ArrayList<String>();
          try {
            FileSystem fs = FileSystem.get(tbl.getDataLocation(), this.pGraphContext.getConf());
            FileStatus[] files = fs.listStatus(new Path(tbl.getDataLocation().toString()));
            if(files != null) {
              for(FileStatus file : files) {
                fileNames.add(file.getPath().toString());
              }
            }
          } catch (IOException e) {
            throw new SemanticException(e);
          }
          aliasToBucketFileNamesMapping.put(alias, fileNames);
        }
      }

      // All tables or partitions are bucketed and their bucket counts have been
      // collected above; now check that the number of buckets in the big table
      // can be divided by the number of buckets in each small table.
      if (bigTblPartsToBucketNumber.size() > 0) {
        Iterator<Entry<Partition, Integer>> bigTblPartToBucketNumber = bigTblPartsToBucketNumber
            .entrySet().iterator();
        while (bigTblPartToBucketNumber.hasNext()) {
          int bucketNumberInPart = bigTblPartToBucketNumber.next().getValue();
          if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
              bucketNumberInPart)) {
            return null;
          }
        }
      } else {
        int bucketNoInBigTbl = aliasToBucketNumberMapping.get(baseBigAlias).intValue();
        if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
            bucketNoInBigTbl)) {
          return null;
        }
      }

      MapJoinDesc desc = mapJoinOp.getConf();

      LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>> aliasBucketFileNameMapping =
        new LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>>();

      // sort bucket file names for the big table
      if(bigTblPartsToBucketNumber.size() > 0) {
        Collection<List<String>> bucketNamesAllParts = bigTblPartsToBucketFileNames.values();
        for(List<String> partBucketNames : bucketNamesAllParts) {
          Collections.sort(partBucketNames);
        }
      } else {
        Collections.sort(aliasToBucketFileNamesMapping.get(baseBigAlias));
      }

      // go through all small tables and get the mapping from bucket file name
      // in the big table to bucket file names in small tables.
      for (int j = 0; j < joinAliases.size(); j++) {
        String alias = joinAliases.get(j);
        if(alias.equals(baseBigAlias)) {
          continue;
        }
        Collections.sort(aliasToBucketFileNamesMapping.get(alias));
        LinkedHashMap<String, ArrayList<String>> mapping = new LinkedHashMap<String, ArrayList<String>>();
        aliasBucketFileNameMapping.put(alias, mapping);

        // for each bucket file in big table, get the corresponding bucket file
        // name in the small table.
        if (bigTblPartsToBucketNumber.size() > 0) {
          // more than one partition in the big table; do the mapping for each partition
          Iterator<Entry<Partition, List<String>>> bigTblPartToBucketNames = bigTblPartsToBucketFileNames
              .entrySet().iterator();
          Iterator<Entry<Partition, Integer>> bigTblPartToBucketNum = bigTblPartsToBucketNumber
              .entrySet().iterator();
          while (bigTblPartToBucketNames.hasNext()) {
            assert bigTblPartToBucketNum.hasNext();
            int bigTblBucketNum = bigTblPartToBucketNum.next().getValue().intValue();
            List<String> bigTblBucketNameList = bigTblPartToBucketNames.next().getValue();
            fillMapping(baseBigAlias, aliasToBucketNumberMapping,
                aliasToBucketFileNamesMapping, alias, mapping, bigTblBucketNum,
                bigTblBucketNameList, desc.getBucketFileNameMapping());
          }
        } else {
          List<String> bigTblBucketNameList = aliasToBucketFileNamesMapping.get(baseBigAlias);
          int bigTblBucketNum =  aliasToBucketNumberMapping.get(baseBigAlias);
          fillMapping(baseBigAlias, aliasToBucketNumberMapping,
              aliasToBucketFileNamesMapping, alias, mapping, bigTblBucketNum,
              bigTblBucketNameList, desc.getBucketFileNameMapping());
        }
      }
      desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
      desc.setBigTableAlias(baseBigAlias);
      return null;
    }
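
The bucket map join optimizer above collects, for every join alias, the bucket count and the sorted list of bucket file names, verifies that the bucket counts are compatible with the big table, and finally records the per-alias mapping from big-table bucket files to small-table bucket files on the MapJoinDesc via setAliasBucketFileNameMapping and setBigTableAlias. The compatibility test is the one described in the comment: the big table's bucket count must be divisible by each small table's bucket count. A minimal sketch of that rule follows; the method name mirrors the snippet, but the body only illustrates the rule as stated in the comment and is not the actual Hive implementation.

import java.util.Map;

final class BucketNumberCheckSketch {

  // Illustration only: every small table's bucket count must evenly divide the
  // big table's bucket count, otherwise the optimizer gives up on the bucket
  // map join (the snippet returns null in that case).
  static boolean checkBucketNumberAgainstBigTable(
      Map<String, Integer> aliasToBucketNumberMapping, int bucketNumberInBigTable) {
    for (int smallTableBuckets : aliasToBucketNumberMapping.values()) {
      if (smallTableBuckets <= 0 || bucketNumberInBigTable % smallTableBuckets != 0) {
        return false;
      }
    }
    return true;
  }
}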

      Operator<? extends Serializable> reducer = clonePlan.getReducer();
      assert reducer instanceof JoinOperator;
      JoinOperator cloneJoinOp = (JoinOperator) reducer;

      String dumpFilePrefix = "mapfile"+PlanUtils.getCountForMapJoinDumpFilePrefix();
      MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc,
          newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor
          .getOutputColumnNames(), i, joinDescriptor.getConds(),
          joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix);
      mapJoinDescriptor.setTagOrder(tags);
      mapJoinDescriptor.setHandleSkewJoin(false);

      MapredLocalWork localPlan = new MapredLocalWork(
          new LinkedHashMap<String, Operator<? extends Serializable>>(),
          new LinkedHashMap<String, FetchWork>());
      Map<Byte, String> smallTblDirs = smallKeysDirMap.get(src);

        initUnionPlan(opProcCtx, currTask, false);
      }

      opProcCtx.setCurrMapJoinOp(null);
    } else {
      MapJoinDesc desc = (MapJoinDesc) op.getConf();

      // The map is overloaded to keep track of mapjoins also
      opTaskMap.put(op, currTask);

      List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
      rootTasks.add(currTask);

      assert currTopOp != null;
      List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps();
      String currAliasId = opProcCtx.getCurrAliasId();

      seenOps.add(currTopOp);
      boolean local = pos != desc.getPosBigTable();
      setTaskPlan(currAliasId, currTopOp, plan, local, opProcCtx);
      setupBucketMapJoinInfo(plan, (AbstractMapJoinOperator<? extends MapJoinDesc>)op, createLocalPlan);
    }

    opProcCtx.setCurrTask(currTask);

      Operator<? extends Serializable> reducer = clonePlan.getReducer();
      assert reducer instanceof JoinOperator;
      JoinOperator cloneJoinOp = (JoinOperator) reducer;

      String dumpFilePrefix = "mapfile"+PlanUtils.getCountForMapJoinDumpFilePrefix();
      MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc,
          newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor
          .getOutputColumnNames(), i, joinDescriptor.getConds(),
          joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix);
      mapJoinDescriptor.setTagOrder(tags);
      mapJoinDescriptor.setHandleSkewJoin(false);
      mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());

      MapredLocalWork localPlan = new MapredLocalWork(
          new LinkedHashMap<String, Operator<? extends Serializable>>(),
          new LinkedHashMap<String, FetchWork>());
      Map<Byte, String> smallTblDirs = smallKeysDirMap.get(src);

      }
      dumpFilePrefix = dumpFilePrefix+"-"+PlanUtils.getCountForMapJoinDumpFilePrefix();
    } else {
      dumpFilePrefix = "mapfile"+PlanUtils.getCountForMapJoinDumpFilePrefix();
    }
    MapJoinDesc mapJoinDescriptor = new MapJoinDesc(keyExprMap, keyTableDesc, valueExprMap,
        valueTableDescs, valueFiltedTableDescs, outputColumnNames, mapJoinPos, joinCondns,
        filterMap, op.getConf().getNoOuterJoin(), dumpFilePrefix);
    mapJoinDescriptor.setTagOrder(tagOrder);
    mapJoinDescriptor.setNullSafes(desc.getNullSafes());

    MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(
        mapJoinDescriptor, new RowSchema(outputRS.getColumnInfos()), newPar);

    OpParseContext ctx = new OpParseContext(outputRS);

        initUnionPlan(opProcCtx, currTask, false);
      }

      opProcCtx.setCurrMapJoinOp(null);
    } else {
      MapJoinDesc desc = (MapJoinDesc) op.getConf();

      // The map is overloaded to keep track of mapjoins also
      opTaskMap.put(op, currTask);

      List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
      if (!rootTasks.contains(currTask)) {
        rootTasks.add(currTask);
      }

      assert currTopOp != null;
      List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps();
      String currAliasId = opProcCtx.getCurrAliasId();

      seenOps.add(currTopOp);
      boolean local = pos != desc.getPosBigTable();
      setTaskPlan(currAliasId, currTopOp, plan, local, opProcCtx);
      setupBucketMapJoinInfo(plan, (AbstractMapJoinOperator<? extends MapJoinDesc>)op, createLocalPlan);
    }

    opProcCtx.setCurrTask(currTask);

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      @SuppressWarnings("unchecked")
      AbstractMapJoinOperator<? extends MapJoinDesc> mjOp = (AbstractMapJoinOperator<? extends MapJoinDesc>) nd;
      MapJoinDesc mjDesc = mjOp.getConf();

      String bigTablAlias = mjDesc.getBigTableAlias();
      if ( bigTablAlias == null ) {
        Operator<? extends OperatorDesc> parent = null;
        for(Operator<? extends OperatorDesc> op : mjOp.getParentOperators() ) {
          if ( op instanceof TableScanOperator ) {
            parent = op;
          }
        }
        if ( parent != null) {
          TableScanDesc tDesc = ((TableScanOperator)parent).getConf();
          bigTablAlias = tDesc.getAlias();
        }
      }
      bigTablAlias = bigTablAlias == null ? "?" : bigTablAlias;

      List<ExprNodeDesc> joinExprs = mjDesc.getKeys().values().iterator().next();

      if ( joinExprs.size() == 0 ) {
        warnings.add(
            String.format("Map Join %s[bigTable=%s] in task '%s' is a cross product",
                mjOp.toString(), bigTablAlias, taskName));
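The processor above reports a map join as a cross product when the join has no key expressions; if the MapJoinDesc carries no big-table alias, the alias is recovered from a parent TableScanOperator instead. A minimal sketch of the detection condition, using only the getters shown in the snippet (the helper itself is hypothetical):

import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;

final class CrossProductCheckSketch {

  // getKeys() maps each join input (keyed by tag) to its key expressions; a
  // map join whose inputs have no key expressions compares nothing, so every
  // row pairs with every row, i.e. a cross product.
  static boolean isCrossProduct(MapJoinDesc mjDesc) {
    Map<Byte, List<ExprNodeDesc>> keys = mjDesc.getKeys();
    if (keys == null || keys.isEmpty()) {
      return false;
    }
    List<ExprNodeDesc> joinExprs = keys.values().iterator().next();
    return joinExprs.isEmpty();
  }
}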

    TableScanDesc desc = op.getConf();
    return !desc.isGatherStats();
  }

  private boolean validateMapJoinOperator(MapJoinOperator op) {
    MapJoinDesc desc = op.getConf();
    return validateMapJoinDesc(desc);
  }

      Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
      assert reducer instanceof JoinOperator;
      JoinOperator cloneJoinOp = (JoinOperator) reducer;

      String dumpFilePrefix = "mapfile"+PlanUtils.getCountForMapJoinDumpFilePrefix();
      MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc,
          newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor
          .getOutputColumnNames(), i, joinDescriptor.getConds(),
          joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix);
      mapJoinDescriptor.setTagOrder(tags);
      mapJoinDescriptor.setHandleSkewJoin(false);
      mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());

      MapredLocalWork localPlan = new MapredLocalWork(
          new LinkedHashMap<String, Operator<? extends OperatorDesc>>(),
          new LinkedHashMap<String, FetchWork>());
      Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
