Source Code of org.apache.phoenix.coprocessor.GroupedAggregateRegionObserver$InMemoryGroupByCache

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.coprocessor;


import static org.apache.phoenix.query.QueryConstants.AGG_TIMESTAMP;
import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN;
import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN_FAMILY;
import static org.apache.phoenix.query.QueryServices.GROUPBY_ESTIMATED_DISTINCT_VALUES_ATTRIB;
import static org.apache.phoenix.query.QueryServices.GROUPBY_SPILLABLE_ATTRIB;
import static org.apache.phoenix.query.QueryServicesOptions.DEFAULT_GROUPBY_ESTIMATED_DISTINCT_VALUES;
import static org.apache.phoenix.query.QueryServicesOptions.DEFAULT_GROUPBY_SPILLABLE;


import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.RegionScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.WritableUtils;
import org.apache.phoenix.cache.GlobalCache;
import org.apache.phoenix.cache.TenantCache;
import org.apache.phoenix.cache.aggcache.SpillableGroupByCache;
import org.apache.phoenix.expression.Expression;
import org.apache.phoenix.expression.ExpressionType;
import org.apache.phoenix.expression.aggregator.Aggregator;
import org.apache.phoenix.expression.aggregator.ServerAggregators;
import org.apache.phoenix.hbase.index.covered.update.ColumnReference;
import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr;
import org.apache.phoenix.index.IndexMaintainer;
import org.apache.phoenix.join.HashJoinInfo;
import org.apache.phoenix.join.TupleProjector;
import org.apache.phoenix.memory.MemoryManager.MemoryChunk;
import org.apache.phoenix.query.QueryConstants;
import org.apache.phoenix.schema.PDataType;
import org.apache.phoenix.schema.SortOrder;
import org.apache.phoenix.schema.tuple.MultiKeyValueTuple;
import org.apache.phoenix.util.IndexUtil;
import org.apache.phoenix.util.KeyValueUtil;
import org.apache.phoenix.util.ScanUtil;
import org.apache.phoenix.util.SizedUtil;
import org.apache.phoenix.util.TupleUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Closeables;


/**
 * Region observer that aggregates grouped rows (i.e. SQL query with GROUP BY clause)
 * 
 * @since 0.1
 */
public class GroupedAggregateRegionObserver extends BaseScannerRegionObserver {
    private static final Logger logger = LoggerFactory
            .getLogger(GroupedAggregateRegionObserver.class);
    public static final int MIN_DISTINCT_VALUES = 100;


    /**
     * Replaces the RegionScanner s with a RegionScanner that groups by the key formed by the list
     * of expressions from the scan and returns the aggregated rows of each group. For example,
     * given the following original rows in the RegionScanner: KEY COL1 row1 a row2 b row3 a row4 a
     * the following rows will be returned for COUNT(*): KEY COUNT a 3 b 1 The client is required to
     * do a sort and a final aggregation, since multiple rows with the same key may be returned from
     * different regions.
     */
    @Override
    protected RegionScanner doPostScannerOpen(ObserverContext<RegionCoprocessorEnvironment> c,
            Scan scan, RegionScanner s) throws IOException {
        boolean keyOrdered = false;
        byte[] expressionBytes = scan.getAttribute(BaseScannerRegionObserver.UNORDERED_GROUP_BY_EXPRESSIONS);


        if (expressionBytes == null) {
            expressionBytes = scan.getAttribute(BaseScannerRegionObserver.KEY_ORDERED_GROUP_BY_EXPRESSIONS);
            keyOrdered = true;
        }
        int offset = 0;
        if (ScanUtil.isLocalIndex(scan)) {
            /*
             * For local indexes, we need to set an offset on row key expressions to skip
             * the region start key.
             */
            HRegion region = c.getEnvironment().getRegion();
            offset = region.getStartKey().length != 0 ? region.getStartKey().length:region.getEndKey().length;
            ScanUtil.setRowKeyOffset(scan, offset);
        }
        
        List<Expression> expressions = deserializeGroupByExpressions(expressionBytes, 0);
        ServerAggregators aggregators =
                ServerAggregators.deserialize(scan
                        .getAttribute(BaseScannerRegionObserver.AGGREGATORS), c
                        .getEnvironment().getConfiguration());


        final TupleProjector p = TupleProjector.deserializeProjectorFromScan(scan);
        final HashJoinInfo j = HashJoinInfo.deserializeHashJoinFromScan(scan);
        long limit = Long.MAX_VALUE;
        byte[] limitBytes = scan.getAttribute(GROUP_BY_LIMIT);
        if (limitBytes != null) {
            limit = PDataType.INTEGER.getCodec().decodeInt(limitBytes, 0, SortOrder.getDefault());
        }


        RegionScanner innerScanner = s;
        if (p != null || j != null) {
            innerScanner =
                    new HashJoinRegionScanner(s, p, j, ScanUtil.getTenantId(scan),
                            c.getEnvironment());
        }
        byte[] localIndexBytes = scan.getAttribute(LOCAL_INDEX_BUILD);
        List<IndexMaintainer> indexMaintainers = localIndexBytes == null ? null : IndexMaintainer.deserialize(localIndexBytes);
        boolean localIndexScan = ScanUtil.isLocalIndex(scan);
        TupleProjector tupleProjector = null;
        HRegion dataRegion = null;
        byte[][] viewConstants = null;
        ColumnReference[] dataColumns = IndexUtil.deserializeDataTableColumnsToJoin(scan);
        if (ScanUtil.isLocalIndex(scan)) {
            if (dataColumns != null) {
                tupleProjector = IndexUtil.getTupleProjector(scan, dataColumns);
                dataRegion = IndexUtil.getDataRegion(c.getEnvironment());
                viewConstants = IndexUtil.deserializeViewConstantsFromScan(scan);
            }
        } 


        if (keyOrdered) { // Optimize by taking advantage that the rows are
                          // already in the required group by key order
            return scanOrdered(c, scan, innerScanner, expressions, aggregators, limit, offset,
                localIndexScan, dataColumns, tupleProjector, indexMaintainers, dataRegion,
                viewConstants);
        } else { // Otherwse, collect them all up in an in memory map
            return scanUnordered(c, scan, innerScanner, expressions, aggregators, limit, offset,
                localIndexScan, dataColumns, tupleProjector, indexMaintainers, dataRegion,
                viewConstants);
        }
    }


    public static long sizeOfUnorderedGroupByMap(int nRows, int valueSize) {
        return SizedUtil.sizeOfMap(nRows, SizedUtil.IMMUTABLE_BYTES_WRITABLE_SIZE, valueSize);
    }


    public static void serializeIntoScan(Scan scan, String attribName,
            List<Expression> groupByExpressions) {
        ByteArrayOutputStream stream =
                new ByteArrayOutputStream(Math.max(1, groupByExpressions.size() * 10));
        try {
            if (groupByExpressions.isEmpty()) { // FIXME ?
                stream.write(QueryConstants.TRUE);
            } else {
                DataOutputStream output = new DataOutputStream(stream);
                for (Expression expression : groupByExpressions) {
                    WritableUtils.writeVInt(output, ExpressionType.valueOf(expression).ordinal());
                    expression.write(output);
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e); // Impossible
        } finally {
            try {
                stream.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        scan.setAttribute(attribName, stream.toByteArray());


    }


    private List<Expression> deserializeGroupByExpressions(byte[] expressionBytes, int offset)
            throws IOException {
        List<Expression> expressions = new ArrayList<Expression>(3);
        ByteArrayInputStream stream = new ByteArrayInputStream(expressionBytes);
        try {
            DataInputStream input = new DataInputStream(stream);
            while (true) {
                try {
                    int expressionOrdinal = WritableUtils.readVInt(input);
                    Expression expression =
                            ExpressionType.values()[expressionOrdinal].newInstance();
                    expression.readFields(input);
                    if (offset != 0) {
                        IndexUtil.setRowKeyExpressionOffset(expression, offset);
                    }
                    expressions.add(expression);
                } catch (EOFException e) {
                    break;
                }
            }
        } finally {
            stream.close();
        }
        return expressions;
    }


    /**
     * 
     * Cache for distinct values and their aggregations which is completely
     * in-memory (as opposed to spilling to disk). Used when GROUPBY_SPILLABLE_ATTRIB
     * is set to false. The memory usage is tracked at a coursed grain and will
     * throw and abort if too much is used.
     *
     * 
     * @since 3.0.0
     */
    private static final class InMemoryGroupByCache implements GroupByCache {
        private final MemoryChunk chunk;
        private final Map<ImmutableBytesPtr, Aggregator[]> aggregateMap;
        private final ServerAggregators aggregators;
        private final RegionCoprocessorEnvironment env;
        
        private int estDistVals;
        
        InMemoryGroupByCache(RegionCoprocessorEnvironment env, ImmutableBytesWritable tenantId, ServerAggregators aggregators, int estDistVals) {
            int estValueSize = aggregators.getEstimatedByteSize();
            long estSize = sizeOfUnorderedGroupByMap(estDistVals, estValueSize);
            TenantCache tenantCache = GlobalCache.getTenantCache(env, tenantId);
            this.env = env;
            this.estDistVals = estDistVals;
            this.aggregators = aggregators;
            this.aggregateMap = Maps.newHashMapWithExpectedSize(estDistVals);
            this.chunk = tenantCache.getMemoryManager().allocate(estSize);
        }
        
        @Override
        public void close() throws IOException {
            this.chunk.close();
        }


        @Override
        public Aggregator[] cache(ImmutableBytesWritable cacheKey) {
            ImmutableBytesPtr key = new ImmutableBytesPtr(cacheKey);
            Aggregator[] rowAggregators = aggregateMap.get(key);
            if (rowAggregators == null) {
                // If Aggregators not found for this distinct
                // value, clone our original one (we need one
                // per distinct value)
                if (logger.isDebugEnabled()) {
                    logger.debug("Adding new aggregate bucket for row key "
                            + Bytes.toStringBinary(key.get(), key.getOffset(),
                                key.getLength()));
                }
                rowAggregators =
                        aggregators.newAggregators(env.getConfiguration());
                aggregateMap.put(key, rowAggregators);


                if (aggregateMap.size() > estDistVals) { // increase allocation
                    estDistVals *= 1.5f;
                    long estSize = sizeOfUnorderedGroupByMap(estDistVals, aggregators.getEstimatedByteSize());
                    chunk.resize(estSize);
                }
            }
            return rowAggregators;
        }


        @Override
        public RegionScanner getScanner(final RegionScanner s) {
            // Compute final allocation
            long estSize = sizeOfUnorderedGroupByMap(aggregateMap.size(), aggregators.getEstimatedByteSize());
            chunk.resize(estSize);


            final List<KeyValue> aggResults = new ArrayList<KeyValue>(aggregateMap.size());
            
            final Iterator<Map.Entry<ImmutableBytesPtr, Aggregator[]>> cacheIter =
                    aggregateMap.entrySet().iterator();
            while (cacheIter.hasNext()) {
                Map.Entry<ImmutableBytesPtr, Aggregator[]> entry = cacheIter.next();
                ImmutableBytesPtr key = entry.getKey();
                Aggregator[] rowAggregators = entry.getValue();
                // Generate byte array of Aggregators and set as value of row
                byte[] value = aggregators.toBytes(rowAggregators);


                if (logger.isDebugEnabled()) {
                    logger.debug("Adding new distinct group: "
                            + Bytes.toStringBinary(key.get(), key.getOffset(), key.getLength())
                            + " with aggregators " + Arrays.asList(rowAggregators).toString()
                            + " value = " + Bytes.toStringBinary(value));
                }
                KeyValue keyValue =
                        KeyValueUtil.newKeyValue(key.get(), key.getOffset(), key.getLength(),
                            SINGLE_COLUMN_FAMILY, SINGLE_COLUMN, AGG_TIMESTAMP, value, 0,
                            value.length);
                aggResults.add(keyValue);
            }
            // scanner using the non spillable, memory-only implementation
            return new BaseRegionScanner() {
                private int index = 0;


                @Override
                public HRegionInfo getRegionInfo() {
                    return s.getRegionInfo();
                }


                @Override
                public void close() throws IOException {
                    try {
                        s.close();
                    } finally {
                        InMemoryGroupByCache.this.close();
                    }
                }


                @Override
                public boolean next(List<Cell> results) throws IOException {
                    if (index >= aggResults.size()) return false;
                    results.add(aggResults.get(index));
                    index++;
                    return index < aggResults.size();
                }


                @Override
                public long getMaxResultSize() {
                  return s.getMaxResultSize();
                }
            };
        }


        @Override
        public long size() {
            return aggregateMap.size();
        }
        
    }
    private static final class GroupByCacheFactory {
        public static final GroupByCacheFactory INSTANCE = new GroupByCacheFactory();
        
        private GroupByCacheFactory() {
        }
        
        GroupByCache newCache(RegionCoprocessorEnvironment env, ImmutableBytesWritable tenantId, ServerAggregators aggregators, int estDistVals) {
            Configuration conf = env.getConfiguration();
            boolean spillableEnabled =
                    conf.getBoolean(GROUPBY_SPILLABLE_ATTRIB, DEFAULT_GROUPBY_SPILLABLE);
            if (spillableEnabled) {
                return new SpillableGroupByCache(env, tenantId, aggregators, estDistVals);
            } 
            
            return new InMemoryGroupByCache(env, tenantId, aggregators, estDistVals);
        }
    }
    /**
     * Used for an aggregate query in which the key order does not necessarily match the group by
     * key order. In this case, we must collect all distinct groups within a region into a map,
     * aggregating as we go.
     * @param limit TODO
     */
    private RegionScanner scanUnordered(ObserverContext<RegionCoprocessorEnvironment> c, Scan scan,
            final RegionScanner s, final List<Expression> expressions,
            final ServerAggregators aggregators, long limit, int offset, boolean localIndexScan,
            ColumnReference[] dataColumns, TupleProjector tupleProjector,
            List<IndexMaintainer> indexMaintainers, HRegion dataRegion, byte[][] viewConstants)
            throws IOException {
        if (logger.isDebugEnabled()) {
            logger.debug("Grouped aggregation over unordered rows with scan " + scan
                    + ", group by " + expressions + ", aggregators " + aggregators);
        }
        RegionCoprocessorEnvironment env = c.getEnvironment();
        Configuration conf = env.getConfiguration();
        int estDistVals = conf.getInt(GROUPBY_ESTIMATED_DISTINCT_VALUES_ATTRIB, DEFAULT_GROUPBY_ESTIMATED_DISTINCT_VALUES);
        byte[] estDistValsBytes = scan.getAttribute(BaseScannerRegionObserver.ESTIMATED_DISTINCT_VALUES);
        if (estDistValsBytes != null) {
            // Allocate 1.5x estimation
            estDistVals = Math.max(MIN_DISTINCT_VALUES, 
                            (int) (Bytes.toInt(estDistValsBytes) * 1.5f));
        }


        final boolean spillableEnabled =
                conf.getBoolean(GROUPBY_SPILLABLE_ATTRIB, DEFAULT_GROUPBY_SPILLABLE);


        GroupByCache groupByCache = 
                GroupByCacheFactory.INSTANCE.newCache(
                        env, ScanUtil.getTenantId(scan), 
                        aggregators, estDistVals);
        ImmutableBytesWritable tempPtr = new ImmutableBytesWritable();
        boolean success = false;
        try {
            boolean hasMore;


            MultiKeyValueTuple result = new MultiKeyValueTuple();
            if (logger.isDebugEnabled()) {
                logger.debug("Spillable groupby enabled: " + spillableEnabled);
            }


            HRegion region = c.getEnvironment().getRegion();
            region.startRegionOperation();
            try {
                do {
                    List<Cell> results = new ArrayList<Cell>();
                    // Results are potentially returned even when the return
                    // value of s.next is false
                    // since this is an indication of whether or not there are
                    // more values after the
                    // ones returned
                    hasMore = s.nextRaw(results);
                    if (!results.isEmpty()) {
                        if (localIndexScan) {
                            IndexUtil.wrapResultUsingOffset(results, offset, dataColumns, tupleProjector,
                                dataRegion, indexMaintainers == null ? null : indexMaintainers.get(0),
                                viewConstants, tempPtr);
                        }
                        result.setKeyValues(results);
                        ImmutableBytesWritable key =
                                TupleUtil.getConcatenatedValue(result, expressions);
                        Aggregator[] rowAggregators = groupByCache.cache(key);
                        // Aggregate values here
                        aggregators.aggregate(rowAggregators, result);
                    }
                } while (hasMore && groupByCache.size() < limit);
            } finally {
                region.closeRegionOperation();
            }


            RegionScanner regionScanner = groupByCache.getScanner(s);


            // Do not sort here, but sort back on the client instead
            // The reason is that if the scan ever extends beyond a region
            // (which can happen if we're basing our parallelization split
            // points on old metadata), we'll get incorrect query results.
            success = true;
            return regionScanner;
        } finally {
            if (!success) {
                Closeables.closeQuietly(groupByCache);
            }
        }
    }


    /**
     * Used for an aggregate query in which the key order match the group by key order. In this
     * case, we can do the aggregation as we scan, by detecting when the group by key changes.
     * @param limit TODO
     * @throws IOException 
     */
    private RegionScanner scanOrdered(final ObserverContext<RegionCoprocessorEnvironment> c,
            Scan scan, final RegionScanner s, final List<Expression> expressions,
            final ServerAggregators aggregators, final long limit, final int offset,
            final boolean localIndexScan, final ColumnReference[] dataColumns,
            final TupleProjector tupleProjector, final List<IndexMaintainer> indexMaintainers,
            final HRegion dataRegion, final byte[][] viewConstants) throws IOException {


        if (logger.isDebugEnabled()) {
            logger.debug("Grouped aggregation over ordered rows with scan " + scan + ", group by "
                    + expressions + ", aggregators " + aggregators);
        }
        final ImmutableBytesWritable tempPtr = new ImmutableBytesWritable();
        return new BaseRegionScanner() {
            private long rowCount = 0;
            private ImmutableBytesWritable currentKey = null;


            @Override
            public HRegionInfo getRegionInfo() {
                return s.getRegionInfo();
            }


            @Override
            public void close() throws IOException {
                s.close();
            }


            @Override
            public boolean next(List<Cell> results) throws IOException {
                boolean hasMore;
                boolean atLimit;
                boolean aggBoundary = false;
                MultiKeyValueTuple result = new MultiKeyValueTuple();
                ImmutableBytesWritable key = null;
                Aggregator[] rowAggregators = aggregators.getAggregators();
                // If we're calculating no aggregate functions, we can exit at the
                // start of a new row. Otherwise, we have to wait until an agg
                int countOffset = rowAggregators.length == 0 ? 1 : 0;
                HRegion region = c.getEnvironment().getRegion();
                region.startRegionOperation();
                try {
                    do {
                        List<Cell> kvs = new ArrayList<Cell>();
                        // Results are potentially returned even when the return
                        // value of s.next is false
                        // since this is an indication of whether or not there
                        // are more values after the
                        // ones returned
                        hasMore = s.nextRaw(kvs);
                        if (!kvs.isEmpty()) {
                            if (localIndexScan) {
                                IndexUtil.wrapResultUsingOffset(kvs, offset, dataColumns, tupleProjector,
                                    dataRegion, indexMaintainers == null ? null : indexMaintainers.get(0),
                                    viewConstants, tempPtr);
                            }
                            result.setKeyValues(kvs);
                            key = TupleUtil.getConcatenatedValue(result, expressions);
                            aggBoundary = currentKey != null && currentKey.compareTo(key) != 0;
                            if (!aggBoundary) {
                                aggregators.aggregate(rowAggregators, result);
                                if (logger.isDebugEnabled()) {
                                    logger.debug("Row passed filters: " + kvs
                                            + ", aggregated values: "
                                            + Arrays.asList(rowAggregators));
                                }
                                currentKey = key;
                            }
                        }
                        atLimit = rowCount + countOffset >= limit;
                        // Do rowCount + 1 b/c we don't have to wait for a complete
                        // row in the case of a DISTINCT with a LIMIT
                    } while (hasMore && !aggBoundary && !atLimit);
                } finally {
                    region.closeRegionOperation();
                }


                if (currentKey != null) {
                    byte[] value = aggregators.toBytes(rowAggregators);
                    KeyValue keyValue =
                            KeyValueUtil.newKeyValue(currentKey.get(), currentKey.getOffset(),
                                currentKey.getLength(), SINGLE_COLUMN_FAMILY, SINGLE_COLUMN,
                                AGG_TIMESTAMP, value, 0, value.length);
                    results.add(keyValue);
                    if (logger.isDebugEnabled()) {
                        logger.debug("Adding new aggregate row: "
                                + keyValue
                                + ",for current key "
                                + Bytes.toStringBinary(currentKey.get(), currentKey.getOffset(),
                                    currentKey.getLength()) + ", aggregated values: "
                                + Arrays.asList(rowAggregators));
                    }
                    // If we're at an aggregation boundary, reset the
                    // aggregators and
                    // aggregate with the current result (which is not a part of
                    // the returned result).
                    if (aggBoundary) {
                        aggregators.reset(rowAggregators);
                        aggregators.aggregate(rowAggregators, result);
                        currentKey = key;
                        rowCount++;
                        atLimit |= rowCount >= limit;
                    }
                }
                // Continue if there are more
                if (!atLimit && (hasMore || aggBoundary)) {
                    return true;
                }
                currentKey = null;
                return false;
            }
            
            @Override
            public long getMaxResultSize() {
                return s.getMaxResultSize();
            }
        };
    }


    @Override
    protected boolean isRegionObserverFor(Scan scan) {
        return scan.getAttribute(BaseScannerRegionObserver.UNORDERED_GROUP_BY_EXPRESSIONS) != null ||
               scan.getAttribute(BaseScannerRegionObserver.KEY_ORDERED_GROUP_BY_EXPRESSIONS) != null;
    }
}
Source Code of org.apache.phoenix.coprocessor.GroupedAggregateRegionObserver$InMemoryGroupByCache

Related Classes of org.apache.phoenix.coprocessor.GroupedAggregateRegionObserver$InMemoryGroupByCache