@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
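// Skip map joins that an earlier check has already rejected for this optimization.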
if(context.getListOfRejectedMapjoins().contains(mapJoinOp)) {
return null;
}
QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext().get(mapJoinOp);
if(joinCxt == null) {
return null;
}
List<String> joinAliases = new ArrayList<String>();
String[] srcs = joinCxt.getBaseSrc();
String[] left = joinCxt.getLeftAliases();
List<String> mapAlias = joinCxt.getMapAliases();
String baseBigAlias = null;
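// Collect all join aliases; the alias that is not a map-side (small table)
// alias is treated as the big table alias.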
for(String s : left) {
if(s != null && !joinAliases.contains(s)) {
joinAliases.add(s);
if(!mapAlias.contains(s)) {
baseBigAlias = s;
}
}
}
for(String s : srcs) {
if(s != null && !joinAliases.contains(s)) {
joinAliases.add(s);
if(!mapAlias.contains(s)) {
baseBigAlias = s;
}
}
}
MapJoinDesc mjDecs = mapJoinOp.getConf();
LinkedHashMap<String, Integer> aliasToBucketNumberMapping = new LinkedHashMap<String, Integer>();
LinkedHashMap<String, List<String>> aliasToBucketFileNamesMapping = new LinkedHashMap<String, List<String>>();
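// Bucket count and bucket file names keyed by table alias. A big table with
// more than one partition is tracked separately in the maps declared below.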
// Right now this code does not work with "a join b on a.key = b.key and
// a.ds = b.ds", where ds is a partition column. It only works with joins
// in which each join source table has only one partition present.
Map<String, Operator<? extends Serializable>> topOps = this.pGraphContext.getTopOps();
Map<TableScanOperator, Table> topToTable = this.pGraphContext.getTopToTable();
// (partition -> bucket file names) and (partition -> bucket number) for
// the big table.
LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames = new LinkedHashMap<Partition, List<String>>();
LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber = new LinkedHashMap<Partition, Integer>();
for (int index = 0; index < joinAliases.size(); index++) {
String alias = joinAliases.get(index);
TableScanOperator tso = (TableScanOperator) topOps.get(alias);
if (tso == null) {
return null;
}
Table tbl = topToTable.get(tso);
if(tbl.isPartitioned()) {
PrunedPartitionList prunedParts = null;
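// Reuse the cached pruned partition list for this table scan if pruning
// already ran; otherwise prune now and cache the result.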
try {
prunedParts = pGraphContext.getOpToPartList().get(tso);
if (prunedParts == null) {
prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
pGraphContext.getPrunedPartitions());
pGraphContext.getOpToPartList().put(tso, prunedParts);
}
} catch (HiveException e) {
// Has to use full name to make sure it does not conflict with
// org.apache.commons.lang.StringUtils
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
int partNumber = prunedParts.getConfirmedPartns().size()
+ prunedParts.getUnknownPartns().size();
if (partNumber > 1) {
// only allow one partition for small tables
if (!alias.equals(baseBigAlias)) {
return null;
}
// This is the big table and it has more than one partition.
// Construct mappings of (Partition -> bucket file names) and
// (Partition -> bucket number).
List<Partition> allBigTblPartitions = new ArrayList<Partition>();
allBigTblPartitions.addAll(prunedParts.getConfirmedPartns());
allBigTblPartitions.addAll(prunedParts.getUnknownPartns());
for (Partition p : allBigTblPartitions) {
if (!checkBucketColumns(p.getBucketCols(), mjDecs, index)) {
return null;
}
List<String> fileNames = getOnePartitionBucketFileNames(p);
bigTblPartsToBucketFileNames.put(p, fileNames);
bigTblPartsToBucketNumber.put(p, p.getBucketCount());
}
// If the big table has more than one partition,
// aliasToBucketFileNamesMapping and aliasToBucketNumberMapping will
// not contain mappings for the big table. Instead, its mappings are
// kept in bigTblPartsToBucketFileNames and bigTblPartsToBucketNumber.
} else {
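// Only one partition remains after pruning; record its bucket count and
// bucket file names under the table alias.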
Partition part = null;
Iterator<Partition> iter = prunedParts.getConfirmedPartns()
.iterator();
if (iter.hasNext()) {
part = iter.next();
}
if (part == null) {
iter = prunedParts.getUnknownPartns().iterator();
if (iter.hasNext()) {
part = iter.next();
}
}
assert part != null;
Integer num = Integer.valueOf(part.getBucketCount());
aliasToBucketNumberMapping.put(alias, num);
if (!checkBucketColumns(part.getBucketCols(), mjDecs, index)) {
return null;
}
List<String> fileNames = getOnePartitionBucketFileNames(part);
aliasToBucketFileNamesMapping.put(alias, fileNames);
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(part, fileNames);
bigTblPartsToBucketNumber.put(part, num);
}
}
} else {
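// Unpartitioned table: take the bucket count from the table itself and list
// the bucket files directly under the table's data location.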
if (!checkBucketColumns(tbl.getBucketCols(), mjDecs, index)) {
return null;
}
Integer num = Integer.valueOf(tbl.getNumBuckets());
aliasToBucketNumberMapping.put(alias, num);
List<String> fileNames = new ArrayList<String>();
try {
FileSystem fs = FileSystem.get(tbl.getDataLocation(), this.pGraphContext.getConf());
FileStatus[] files = fs.listStatus(new Path(tbl.getDataLocation().toString()));
if(files != null) {
for(FileStatus file : files) {
fileNames.add(file.getPath().toString());
}
}
} catch (IOException e) {
throw new SemanticException(e);
}
aliasToBucketFileNamesMapping.put(alias, fileNames);
}
}
// All tables or partitions are bucketed, and their bucket counts are
// stored in aliasToBucketNumberMapping (or, for a multi-partition big
// table, in bigTblPartsToBucketNumber). Check that the number of buckets
// in the big table is divisible by the number of buckets in each small table.
if (bigTblPartsToBucketNumber.size() > 0) {
for (int bucketNumberInPart : bigTblPartsToBucketNumber.values()) {
if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
bucketNumberInPart)) {
return null;
}
}
} else {
int bucketNoInBigTbl = aliasToBucketNumberMapping.get(baseBigAlias).intValue();
if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
bucketNoInBigTbl)) {
return null;
}
}
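// alias -> (big table bucket file name -> corresponding small table bucket
// file names); populated in the remainder of this method and attached to
// the map join descriptor.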
MapJoinDesc desc = mapJoinOp.getConf();
LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>> aliasBucketFileNameMapping =
new LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>>();
// sort bucket names for the big table