Package org.apache.hadoop.hive.ql.plan

Examples of org.apache.hadoop.hive.ql.plan.Statistics


    private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
        AnnotateStatsProcCtx aspCtx, List<String> neededCols,
        FilterOperator fop) throws CloneNotSupportedException {
      long newNumRows = 0;
      Statistics andStats = null;

      if (pred instanceof ExprNodeGenericFuncDesc) {
        ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred;
        GenericUDF udf = genFunc.getGenericUDF();
View Full Code Here


  public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
      Table table, List<ColumnInfo> schema, List<String> neededColumns,
      List<String> referencedColumns, boolean fetchColStats, boolean fetchPartStats)
      throws HiveException {

    Statistics stats = new Statistics();

    float deserFactor =
        HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);

    if (!table.isPartitioned()) {
      long nr = getNumRows(table);
      long ds = getRawDataSize(table);
      if (ds <= 0) {
        ds = getTotalSize(table);

        // if data size is still 0 then get file size
        if (ds <= 0) {
          ds = getFileSizeForTable(conf, table);
        }

        ds = (long) (ds * deserFactor);
      }

      // number of rows -1 means that statistics from metastore is not reliable
      // and 0 means statistics gathering is disabled
      if (nr <= 0) {
        int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
        if (avgRowSize > 0) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Estimated average row size: " + avgRowSize);
          }
          nr = ds / avgRowSize;
        }
      }
      stats.setNumRows(nr);
      stats.setDataSize(ds);

      List<ColStatistics> colStats = Lists.newArrayList();
      if (fetchColStats) {
        colStats = getTableColumnStats(table, schema, neededColumns);
      }

      // infer if any column can be primary key based on column statistics
      inferAndSetPrimaryKey(stats.getNumRows(), colStats);

      stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
      stats.addToColumnStats(colStats);
    } else if (partList != null) {
      // For partitioned tables, get the size of all the partitions after pruning
      // the partitions that are not required
      long nr = 0;
      long ds = 0;

      List<Long> rowCounts = Lists.newArrayList();
      List<Long> dataSizes = Lists.newArrayList();

      if (fetchPartStats) {
        rowCounts = getBasicStatForPartitions(
            table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
        dataSizes =  getBasicStatForPartitions(
            table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);

        nr = getSumIgnoreNegatives(rowCounts);
        ds = getSumIgnoreNegatives(dataSizes);
        if (ds <= 0) {
          dataSizes = getBasicStatForPartitions(
              table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
          ds = getSumIgnoreNegatives(dataSizes);
        }
      }

      // if data size still could not be determined, then fall back to filesytem to get file
      // sizes
      if (ds <= 0) {
        dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
      }
      ds = getSumIgnoreNegatives(dataSizes);
      ds = (long) (ds * deserFactor);

      int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
      if (avgRowSize > 0) {
        setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
        nr = getSumIgnoreNegatives(rowCounts);
        ds = getSumIgnoreNegatives(dataSizes);

        // number of rows -1 means that statistics from metastore is not reliable
        if (nr <= 0) {
          nr = ds / avgRowSize;
        }
      }
      stats.addToNumRows(nr);
      stats.addToDataSize(ds);

      // if at least a partition does not contain row count then mark basic stats state as PARTIAL
      if (containsNonPositives(rowCounts) &&
          stats.getBasicStatsState().equals(State.COMPLETE)) {
        stats.setBasicStatsState(State.PARTIAL);
      }
      if (fetchColStats) {
        List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
        for (Partition part : partList.getNotDeniedPartns()) {
          partNames.add(part.getName());
        }
        Map<String, String> colToTabAlias = new HashMap<String, String>();
        neededColumns = processNeededColumns(schema, neededColumns, colToTabAlias);
        AggrStats aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(),
            neededColumns, partNames);
        if (null == aggrStats) {
          // There are some partitions with no state (or we didn't fetch any state).
          // Update the stats with empty list to reflect that in the
          // state/initialize structures.
          List<ColStatistics> emptyStats = Lists.newArrayList();

          // add partition column stats
          addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList,
              emptyStats);

          stats.addToColumnStats(emptyStats);
          stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
        } else {
          List<ColumnStatisticsObj> colStats = aggrStats.getColStats();
          if (colStats.size() != neededColumns.size()) {
            LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to" +
                " retrieve for " + colStats.size() + " columns");
          }
          List<ColStatistics> columnStats = convertColStats(colStats, table.getTableName(),
              colToTabAlias);

          addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList,
              columnStats);

          // infer if any column can be primary key based on column statistics
          inferAndSetPrimaryKey(stats.getNumRows(), columnStats);

          stats.addToColumnStats(columnStats);
          State colState = deriveStatType(columnStats, referencedColumns);
          if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
            LOG.debug("Column stats requested for : " + partNames.size() + " partitions. "
                + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
            colState = State.PARTIAL;
          }
          stats.setColumnStatsState(colState);
        }
      }
    }
    return stats;
  }
View Full Code Here

              // add empty stats object for each column
              hiveColStats.add(new ColStatistics(hiveTblMetadata.getTableName(), c, null));
            }
            colNamesFailedStats.clear();
          } else {
            Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList,
                hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats,
                nonPartColNamesThatRqrStats, true, true);
            rowCount = stats.getNumRows();
            hiveColStats = new ArrayList<ColStatistics>();
            for (String c : nonPartColNamesThatRqrStats) {
              ColStatistics cs = stats.getColumnStatisticsFromColName(c);
              if (cs != null) {
                hiveColStats.add(cs);
              } else {
                colNamesFailedStats.add(c);
              }
View Full Code Here

    if (pos == -1) {
      throw new SemanticException("Cannot find position of parent in mapjoin");
    }
    MapJoinDesc joinConf = mapJoinOp.getConf();
    long keyCount = Long.MAX_VALUE, rowCount = Long.MAX_VALUE, bucketCount = 1;
    Statistics stats = parentRS.getStatistics();
    if (stats != null) {
      keyCount = rowCount = stats.getNumRows();
      if (keyCount <= 0) {
        keyCount = rowCount = Long.MAX_VALUE;
      }
      ArrayList<String> keyCols = parentRS.getConf().getOutputKeyColumnNames();
      if (keyCols != null && !keyCols.isEmpty()) {
View Full Code Here

   * @throws HiveException
   */
  public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
      Table table, TableScanOperator tableScanOperator) {

    Statistics stats = new Statistics();

    // column level statistics are required only for the columns that are needed
    List<ColumnInfo> schema = tableScanOperator.getSchema().getSignature();
    List<String> neededColumns = tableScanOperator.getNeededColumns();
    boolean fetchColStats =
        HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
    boolean fetchPartStats =
        HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_PARTITION_STATS);
    float deserFactor =
        HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);

    if (!table.isPartitioned()) {
      long nr = getNumRows(table);
      long ds = getRawDataSize(table);
      if (ds <= 0) {
        ds = getTotalSize(table);

        // if data size is still 0 then get file size
        if (ds <= 0) {
          ds = getFileSizeForTable(conf, table);
        }

        ds = (long) (ds * deserFactor);
      }

      // number of rows -1 means that statistics from metastore is not reliable
      // and 0 means statistics gathering is disabled
      if (nr <= 0) {
        int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
        if (avgRowSize > 0) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Estimated average row size: " + avgRowSize);
          }
          nr = ds / avgRowSize;
        }
      }
      stats.setNumRows(nr);
      stats.setDataSize(ds);

      List<ColStatistics> colStats = Lists.newArrayList();
      if (fetchColStats) {
        colStats = getTableColumnStats(table, schema, neededColumns);
      }

      stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
      stats.addToColumnStats(colStats);
    } else if (partList != null) {
      // For partitioned tables, get the size of all the partitions after pruning
      // the partitions that are not required
      long nr = 0;
      long ds = 0;

      List<Long> rowCounts = Lists.newArrayList();
      List<Long> dataSizes = Lists.newArrayList();

      if (fetchPartStats) {
        rowCounts = getBasicStatForPartitions(
            table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
        dataSizes =  getBasicStatForPartitions(
            table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);

        nr = getSumIgnoreNegatives(rowCounts);
        ds = getSumIgnoreNegatives(dataSizes);
        if (ds <= 0) {
          dataSizes = getBasicStatForPartitions(
              table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
          ds = getSumIgnoreNegatives(dataSizes);
        }
      }

      // if data size still could not be determined, then fall back to filesytem to get file
      // sizes
      if (ds <= 0) {
        dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
      }
      ds = getSumIgnoreNegatives(dataSizes);
      ds = (long) (ds * deserFactor);

      int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
      if (avgRowSize > 0) {
        setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
        nr = getSumIgnoreNegatives(rowCounts);
        ds = getSumIgnoreNegatives(dataSizes);

        // number of rows -1 means that statistics from metastore is not reliable
        if (nr <= 0) {
          nr = ds / avgRowSize;
        }
      }
      stats.addToNumRows(nr);
      stats.addToDataSize(ds);

      // if at least a partition does not contain row count then mark basic stats state as PARTIAL
      if (containsNonPositives(rowCounts)) {
        stats.setBasicStatsState(State.PARTIAL);
      }
      boolean haveFullStats = fetchColStats;
      if (fetchColStats) {
        List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
        for (Partition part : partList.getNotDeniedPartns()) {
          partNames.add(part.getName());
        }
        Map<String, List<ColStatistics>> partStats =
            getPartColumnStats(table, schema, partNames, neededColumns);
        if (partStats != null) {
          for (String partName : partNames) {
            List<ColStatistics> partStat = partStats.get(partName);
            haveFullStats &= (partStat != null);
            if (partStat != null) {
              stats.updateColumnStatsState(deriveStatType(partStat, neededColumns));
              stats.addToColumnStats(partStat);
            }
          }
        }
      }
      // There are some partitions with no state (or we didn't fetch any state).
      // Update the stats with empty list to reflect that in the state/initialize structures.
      if (!haveFullStats) {
        List<ColStatistics> emptyStats = Lists.<ColStatistics>newArrayList();
        stats.addToColumnStats(emptyStats);
        stats.updateColumnStatsState(deriveStatType(emptyStats, neededColumns));
      }
    }
    return stats;
  }
View Full Code Here

    long maxSize = context.conf.getLongVar(
        HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);

    int bigTablePosition = -1;

    Statistics bigInputStat = null;
    long totalSize = 0;
    int pos = 0;

    // bigTableFound means we've encountered a table that's bigger than the
    // max. This table is either the the big table or we cannot convert.
    boolean bigTableFound = false;

    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {

      Statistics currInputStat = parentOp.getStatistics();
      if (currInputStat == null) {
        LOG.warn("Couldn't get statistics from: "+parentOp);
        return -1;
      }

      long inputSize = currInputStat.getDataSize();
      if ((bigInputStat == null) ||
          ((bigInputStat != null) &&
           (inputSize > bigInputStat.getDataSize()))) {

        if (bigTableFound) {
          // cannot convert to map join; we've already chosen a big table
          // on size and there's another one that's bigger.
          return -1;
        }

        if (inputSize/buckets > maxSize) {
          if (!bigTableCandidateSet.contains(pos)) {
            // can't use the current table as the big table, but it's too
            // big for the map side.
            return -1;
          }

          bigTableFound = true;
        }

        if (bigInputStat != null) {
          // we're replacing the current big table with a new one. Need
          // to count the current one as a map table then.
          totalSize += bigInputStat.getDataSize();
        }

        if (totalSize/buckets > maxSize) {
          // sum of small tables size in this join exceeds configured limit
          // hence cannot convert.
          return -1;
        }

        if (bigTableCandidateSet.contains(pos)) {
          bigTablePosition = pos;
          bigInputStat = currInputStat;
        }
      } else {
        totalSize += currInputStat.getDataSize();
        if (totalSize/buckets > maxSize) {
          // cannot hold all map tables in memory. Cannot convert.
          return -1;
        }
      }
View Full Code Here

    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
      FilterOperator fop = (FilterOperator) nd;
      Operator<? extends OperatorDesc> parent = fop.getParentOperators().get(0);
      Statistics parentStats = parent.getStatistics();
      List<String> neededCols = null;
      if (parent instanceof TableScanOperator) {
        TableScanOperator tsop = (TableScanOperator) parent;
        neededCols = tsop.getNeededColumns();
      }

      try {
        if (parentStats != null) {
          ExprNodeDesc pred = fop.getConf().getPredicate();

          // evaluate filter expression and update statistics
          long newNumRows = evaluateExpression(parentStats, pred, aspCtx, neededCols);
          Statistics st = parentStats.clone();

          if (satisfyPrecondition(parentStats)) {

            // update statistics based on column statistics.
            // OR conditions keeps adding the stats independently, this may
            // result in number of rows getting more than the input rows in
            // which case stats need not be updated
            if (newNumRows <= parentStats.getNumRows()) {
              updateStats(st, newNumRows, true);
            }

            if (LOG.isDebugEnabled()) {
              LOG.debug("[0] STATS-" + fop.toString() + ": " + st.extendedToString());
            }
          } else {

            // update only the basic statistics in the absence of column statistics
            if (newNumRows <= parentStats.getNumRows()) {
              updateStats(st, newNumRows, false);
            }

            if (LOG.isDebugEnabled()) {
              LOG.debug("[1] STATS-" + fop.toString() + ": " + st.extendedToString());
            }
          }
          fop.setStatistics(st);
          aspCtx.setAndExprStats(null);
        }
View Full Code Here

    }

    private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
        AnnotateStatsProcCtx aspCtx, List<String> neededCols) throws CloneNotSupportedException {
      long newNumRows = 0;
      Statistics andStats = null;
      if (pred instanceof ExprNodeGenericFuncDesc) {
        ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred;
        GenericUDF udf = genFunc.getGenericUDF();

        // for AND condition cascadingly update stats
View Full Code Here

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      GroupByOperator gop = (GroupByOperator) nd;
      Operator<? extends OperatorDesc> parent = gop.getParentOperators().get(0);
      Statistics parentStats = parent.getStatistics();
      AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
      HiveConf conf = aspCtx.getConf();
      int mapSideParallelism =
          HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAP_SIDE_PARALLELISM);
      List<AggregationDesc> aggDesc = gop.getConf().getAggregators();
      Map<String, ExprNodeDesc> colExprMap = gop.getColumnExprMap();
      RowSchema rs = gop.getSchema();
      Statistics stats = null;

      try {
        if (satisfyPrecondition(parentStats)) {
          stats = parentStats.clone();

          List<ColStatistics> colStats =
              StatsUtils.getColStatisticsFromExprMap(conf, parentStats, colExprMap, rs);
          stats.setColumnStats(colStats);
          long dvProd = 1;
          long newNumRows = 0;

          // compute product of distinct values of grouping columns
          for (ColStatistics cs : colStats) {
            if (cs != null) {
              long dv = cs.getCountDistint();
              if (cs.getNumNulls() > 0) {
                dv += 1;
              }
              dvProd *= dv;
            } else {

              // partial column statistics on grouping attributes case.
              // if column statistics on grouping attribute is missing, then
              // assume worst case.
              // GBY rule will emit half the number of rows if dvProd is 0
              dvProd = 0;
              break;
            }
          }

          // map side
          if (gop.getChildOperators().get(0) instanceof ReduceSinkOperator) {

            // since we do not know if hash-aggregation will be enabled or disabled
            // at runtime we will assume that map-side group by does not do any
            // reduction.hence no group by rule will be applied

            // map-side grouping set present. if grouping set is present then
            // multiply the number of rows by number of elements in grouping set
            if (gop.getConf().isGroupingSetsPresent()) {
              int multiplier = gop.getConf().getListGroupingSets().size();

              // take into account the map-side parallelism as well, default is 1
              multiplier *= mapSideParallelism;
              newNumRows = multiplier * stats.getNumRows();
              long dataSize = multiplier * stats.getDataSize();
              stats.setNumRows(newNumRows);
              stats.setDataSize(dataSize);
              for (ColStatistics cs : colStats) {
                if (cs != null) {
                  long oldNumNulls = cs.getNumNulls();
                  long newNumNulls = multiplier * oldNumNulls;
                  cs.setNumNulls(newNumNulls);
                }
              }
            } else {

              // map side no grouping set
              newNumRows = stats.getNumRows() * mapSideParallelism;
              updateStats(stats, newNumRows, true);
            }
          } else {

            // reduce side
            newNumRows = applyGBYRule(stats.getNumRows(), dvProd);
            updateStats(stats, newNumRows, true);
          }
        } else {
          if (parentStats != null) {

            // worst case, in the absence of column statistics assume half the rows are emitted
            if (gop.getChildOperators().get(0) instanceof ReduceSinkOperator) {

              // map side
              stats = parentStats.clone();
            } else {

              // reduce side
              stats = parentStats.clone();
              long newNumRows = parentStats.getNumRows() / 2;
              updateStats(stats, newNumRows, false);
            }
          }
        }

        // if UDAFs are present, new columns needs to be added
        if (!aggDesc.isEmpty() && stats != null) {
          List<ColStatistics> aggColStats = Lists.newArrayList();
          for (ColumnInfo ci : rs.getSignature()) {

            // if the columns in row schema is not contained in column
            // expression map, then those are the aggregate columns that
            // are added GBY operator. we will estimate the column statistics
            // for those newly added columns
            if (!colExprMap.containsKey(ci.getInternalName())) {
              String colName = ci.getInternalName();
              colName = StatsUtils.stripPrefixFromColumnName(colName);
              String tabAlias = ci.getTabAlias();
              String colType = ci.getTypeName();
              ColStatistics cs = new ColStatistics(tabAlias, colName, colType);
              cs.setCountDistint(stats.getNumRows());
              cs.setNumNulls(0);
              cs.setAvgColLen(StatsUtils.getAvgColLenOfFixedLengthTypes(colType));
              aggColStats.add(cs);
            }
          }
          stats.addToColumnStats(aggColStats);

          // if UDAF present and if column expression map is empty then it must
          // be full aggregation query like count(*) in which case number of
          // rows will be 1
          if (colExprMap.isEmpty()) {
            stats.setNumRows(1);
            updateStats(stats, 1, true);
          }
        }

        gop.setStatistics(stats);

        if (LOG.isDebugEnabled() && stats != null) {
          LOG.debug("[0] STATS-" + gop.toString() + ": " + stats.extendedToString());
        }
      } catch (CloneNotSupportedException e) {
        throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg());
      }
      return null;
View Full Code Here

        if (allSatisfyPreCondition) {

          // statistics object that is combination of statistics from all
          // relations involved in JOIN
          Statistics stats = new Statistics();
          long prodRows = 1;
          List<Long> distinctVals = Lists.newArrayList();
          boolean multiAttr = false;

          Map<String, ColStatistics> joinedColStats = Maps.newHashMap();
          Map<Integer, List<String>> joinKeys = Maps.newHashMap();

          // get the join keys from parent ReduceSink operators
          for (int pos = 0; pos < parents.size(); pos++) {
            ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);

            Statistics parentStats = parent.getStatistics();
            prodRows *= parentStats.getNumRows();
            List<ExprNodeDesc> keyExprs = parent.getConf().getKeyCols();

            // multi-attribute join key
            if (keyExprs.size() > 1) {
              multiAttr = true;
            }

            // compute fully qualified join key column names. this name will be
            // used to quickly look-up for column statistics of join key.
            // TODO: expressions in join condition will be ignored. assign
            // internal name for expressions and estimate column statistics for expression.
            List<String> fqCols =
                StatsUtils.getFullQualifedColNameFromExprs(keyExprs, parent.getColumnExprMap());
            joinKeys.put(pos, fqCols);

            Map<String, ExprNodeDesc> colExprMap = parent.getColumnExprMap();
            RowSchema rs = parent.getSchema();

            // get column statistics for all output columns
            List<ColStatistics> cs =
                StatsUtils.getColStatisticsFromExprMap(conf, parentStats, colExprMap, rs);
            for (ColStatistics c : cs) {
              if (c != null) {
                joinedColStats.put(c.getFullyQualifiedColName(), c);
              }
            }

            // since new statistics is derived from all relations involved in
            // JOIN, we need to update the state information accordingly
            stats.updateColumnStatsState(parentStats.getColumnStatsState());
          }

          // compute denominator i.e, max(V(R,Y), V(S,Y)) in case of single
          // attribute join, else max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2))
          // in case of multi-attribute join
          long denom = 1;
          if (multiAttr) {
            List<Long> perAttrDVs = Lists.newArrayList();
            int numAttr = joinKeys.get(0).size();
            for (int idx = 0; idx < numAttr; idx++) {
              for (Integer i : joinKeys.keySet()) {
                String col = joinKeys.get(i).get(idx);
                ColStatistics cs = joinedColStats.get(col);
                if (cs != null) {
                  perAttrDVs.add(cs.getCountDistint());
                }
              }
              distinctVals.add(getDenominator(perAttrDVs));
              perAttrDVs.clear();
            }

            for (Long l : distinctVals) {
              denom *= l;
            }
          } else {
            for (List<String> jkeys : joinKeys.values()) {
              for (String jk : jkeys) {
                ColStatistics cs = joinedColStats.get(jk);
                if (cs != null) {
                  distinctVals.add(cs.getCountDistint());
                }
              }
            }
            denom = getDenominator(distinctVals);
          }

          // column statistics from different sources are put together and rename
          // fully qualified column names based on output schema of join operator
          Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
          RowSchema rs = jop.getSchema();
          List<ColStatistics> outColStats = Lists.newArrayList();
          for (ColumnInfo ci : rs.getSignature()) {
            String key = ci.getInternalName();
            ExprNodeDesc end = colExprMap.get(key);
            if (end instanceof ExprNodeColumnDesc) {
              String colName = ((ExprNodeColumnDesc) end).getColumn();
              colName = StatsUtils.stripPrefixFromColumnName(colName);
              String tabAlias = ((ExprNodeColumnDesc) end).getTabAlias();
              String fqColName = StatsUtils.getFullyQualifiedColumnName(tabAlias, colName);
              ColStatistics cs = joinedColStats.get(fqColName);
              String outColName = key;
              String outTabAlias = ci.getTabAlias();
              outColName = StatsUtils.stripPrefixFromColumnName(outColName);
              if (cs != null) {
                cs.setColumnName(outColName);
                cs.setTableAlias(outTabAlias);
              }
              outColStats.add(cs);
            }
          }

          // update join statistics
          stats.setColumnStats(outColStats);
          long newRowCount = prodRows / denom;
          stats.setNumRows(newRowCount);
          stats.setDataSize(StatsUtils.getDataSizeFromColumnStats(newRowCount, outColStats));
          jop.setStatistics(stats);

          if (LOG.isDebugEnabled()) {
            LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString());
          }
        } else {

          // worst case when there are no column statistics
          float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR);
          int numParents = parents.size();
          List<Long> parentRows = Lists.newArrayList();
          List<Long> parentSizes = Lists.newArrayList();
          int maxRowIdx = 0;
          long maxRowCount = 0;
          int idx = 0;

          for (Operator<? extends OperatorDesc> op : parents) {
            Statistics ps = op.getStatistics();
            long rowCount = ps.getNumRows();
            if (rowCount > maxRowCount) {
              maxRowCount = rowCount;
              maxRowIdx = idx;
            }
            parentRows.add(rowCount);
            parentSizes.add(ps.getDataSize());
            idx++;
          }

          long maxDataSize = parentSizes.get(maxRowIdx);
          long newNumRows = (long) (joinFactor * maxRowCount * (numParents - 1));
          long newDataSize = (long) (joinFactor * maxDataSize * (numParents - 1));

          Statistics wcStats = new Statistics();
          wcStats.setNumRows(newNumRows);
          wcStats.setDataSize(newDataSize);
          jop.setStatistics(wcStats);

          if (LOG.isDebugEnabled()) {
            LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
          }
        }
      }
      return null;
    }
View Full Code Here

TOP

Related Classes of org.apache.hadoop.hive.ql.plan.Statistics

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.