Source Code of org.apache.phoenix.coprocessor.GroupedAggregateRegionObserver

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.coprocessor;


import static org.apache.phoenix.query.QueryConstants.AGG_TIMESTAMP;
import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN;
import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN_FAMILY;


import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.MultiVersionConsistencyControl;
import org.apache.hadoop.hbase.regionserver.RegionScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.WritableUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import org.apache.phoenix.cache.GlobalCache;
import org.apache.phoenix.cache.TenantCache;
import org.apache.phoenix.expression.Expression;
import org.apache.phoenix.expression.ExpressionType;
import org.apache.phoenix.expression.aggregator.Aggregator;
import org.apache.phoenix.expression.aggregator.ServerAggregators;
import org.apache.phoenix.join.HashJoinInfo;
import org.apache.phoenix.memory.MemoryManager.MemoryChunk;
import org.apache.phoenix.query.QueryConstants;
import org.apache.phoenix.schema.tuple.MultiKeyValueTuple;
import org.apache.phoenix.util.KeyValueUtil;
import org.apache.phoenix.util.ScanUtil;
import org.apache.phoenix.util.SizedUtil;
import org.apache.phoenix.util.TupleUtil;






/**
 * Region observer that aggregates grouped rows (i.e. SQL query with GROUP BY clause)
 * 
 * 
 * @since 0.1
 */
public class GroupedAggregateRegionObserver extends BaseScannerRegionObserver {
    private static final Logger logger = LoggerFactory.getLogger(GroupedAggregateRegionObserver.class);
    
    public static final String AGGREGATORS = "Aggs";
    public static final String UNORDERED_GROUP_BY_EXPRESSIONS = "UnorderedGroupByExpressions";
    public static final String KEY_ORDERED_GROUP_BY_EXPRESSIONS = "OrderedGroupByExpressions";


    public static final String ESTIMATED_DISTINCT_VALUES = "EstDistinctValues";
    public static final int DEFAULT_ESTIMATED_DISTINCT_VALUES = 10000;
    public static final int MIN_DISTINCT_VALUES = 100;
    


    /**
     * Replaces the RegionScanner s with a RegionScanner that groups by the key formed by the list of expressions from the scan
     * and returns the aggregated rows of each group.  For example, given the following original rows in the RegionScanner:
     * KEY    COL1
     * row1   a
     * row2   b
     * row3   a
     * row4   a
     * 
     * the following rows will be returned for COUNT(*):
     * KEY    COUNT
     * a      3
     * b      1
     *
     * The client is required to do a sort and a final aggregation, since multiple rows with the same key may be returned from different regions.
     */
    @Override
    protected RegionScanner doPostScannerOpen(ObserverContext<RegionCoprocessorEnvironment> c, Scan scan, RegionScanner s) throws IOException {
        boolean keyOrdered = false;
        byte[] expressionBytes = scan.getAttribute(UNORDERED_GROUP_BY_EXPRESSIONS);


        if (expressionBytes == null) {
            expressionBytes = scan.getAttribute(KEY_ORDERED_GROUP_BY_EXPRESSIONS);
            if (expressionBytes == null) {
                return s;
            }
            keyOrdered = true;
        }
        List<Expression> expressions = deserializeGroupByExpressions(expressionBytes);
        
        ServerAggregators aggregators = ServerAggregators.deserialize(
                scan.getAttribute(GroupedAggregateRegionObserver.AGGREGATORS), c.getEnvironment().getConfiguration());


        final ScanProjector p = ScanProjector.deserializeProjectorFromScan(scan);
        final HashJoinInfo j = HashJoinInfo.deserializeHashJoinFromScan(scan);        
        RegionScanner innerScanner = s;
        if (p != null || j != null) {
            innerScanner = new HashJoinRegionScanner(s, p, j, ScanUtil.getTenantId(scan), c.getEnvironment());
        }
        
        if (keyOrdered) { // Optimize by taking advantage that the rows are already in the required group by key order
            return scanOrdered(c, scan, innerScanner, expressions, aggregators);
        } else { // Otherwse, collect them all up in an in memory map
            return scanUnordered(c, scan, innerScanner, expressions, aggregators);
        }
    }


    private static int sizeOfUnorderedGroupByMap(int nRows, int valueSize) {
        return SizedUtil.sizeOfMap(nRows, SizedUtil.IMMUTABLE_BYTES_WRITABLE_SIZE, valueSize);
    }


    public static void serializeIntoScan(Scan scan, String attribName, List<Expression> groupByExpressions) {
        ByteArrayOutputStream stream = new ByteArrayOutputStream(Math.max(1, groupByExpressions.size() * 10));
        try {
            if (groupByExpressions.isEmpty()) { // FIXME ?
                stream.write(QueryConstants.TRUE);
            } else {
                DataOutputStream output = new DataOutputStream(stream);
                for (Expression expression : groupByExpressions) {
                    WritableUtils.writeVInt(output, ExpressionType.valueOf(expression).ordinal());
                    expression.write(output);
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e); // Impossible
        } finally {
            try {
                stream.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        scan.setAttribute(attribName, stream.toByteArray());


    }


    private List<Expression> deserializeGroupByExpressions(byte[] expressionBytes) throws IOException {
        List<Expression> expressions = new ArrayList<Expression>(3);
        ByteArrayInputStream stream = new ByteArrayInputStream(expressionBytes);
        try {
            DataInputStream input = new DataInputStream(stream);
            while (true) {
                try {
                    int expressionOrdinal = WritableUtils.readVInt(input);
                    Expression expression = ExpressionType.values()[expressionOrdinal].newInstance();
                    expression.readFields(input);
                    expressions.add(expression);
                } catch (EOFException e) {
                    break;
                }
            }
        } finally {
            stream.close();
        }
        return expressions;
    }
    
    /**
     * Used for an aggregate query in which the key order does not necessarily match the group by key order. In this case,
     * we must collect all distinct groups within a region into a map, aggregating as we go.
     */
    private RegionScanner scanUnordered(ObserverContext<RegionCoprocessorEnvironment> c, Scan scan, final RegionScanner s, List<Expression> expressions, ServerAggregators aggregators) throws IOException {
        
        if (logger.isDebugEnabled()) {
            logger.debug("Grouped aggregation over unordered rows with scan " + scan + ", group by " + expressions + ", aggregators " + aggregators);
        }
        int estDistVals = DEFAULT_ESTIMATED_DISTINCT_VALUES;
        byte[] estDistValsBytes = scan.getAttribute(ESTIMATED_DISTINCT_VALUES);
        if (estDistValsBytes != null) {
            estDistVals = Math.min(MIN_DISTINCT_VALUES, (int)(Bytes.toInt(estDistValsBytes) * 1.5f));  // Allocate 1.5x estimation
        }
        
        TenantCache tenantCache = GlobalCache.getTenantCache(c.getEnvironment(), ScanUtil.getTenantId(scan));
        int estSize = sizeOfUnorderedGroupByMap(estDistVals, aggregators.getSize());
        final MemoryChunk chunk = tenantCache.getMemoryManager().allocate(estSize);
        boolean success = false;
        try {
            // TODO: spool map to disk if map becomes too big
            boolean hasMore;
            int estValueSize = aggregators.getSize();
            MultiKeyValueTuple result = new MultiKeyValueTuple();
            Map<ImmutableBytesWritable, Aggregator[]> aggregateMap = new HashMap<ImmutableBytesWritable, Aggregator[]>(estDistVals);
            HRegion region = c.getEnvironment().getRegion();
            MultiVersionConsistencyControl.setThreadReadPoint(s.getMvccReadPoint());
            region.startRegionOperation();
            try {
                do {
                    List<KeyValue> results = new ArrayList<KeyValue>();
                    // Results are potentially returned even when the return value of s.next is false
                    // since this is an indication of whether or not there are more values after the
                    // ones returned
                    hasMore = s.nextRaw(results, null);
                    if (!results.isEmpty()) {
                        result.setKeyValues(results);
                        ImmutableBytesWritable key = TupleUtil.getConcatenatedValue(result, expressions);
                        Aggregator[] rowAggregators = aggregateMap.get(key);
                        if (rowAggregators == null) {
                            // If Aggregators not found for this distinct value, clone our original one (we need one per distinct value)
                            if (logger.isDebugEnabled()) {
                                logger.debug("Adding new aggregate bucket for row key " + Bytes.toStringBinary(key.get(),key.getOffset(),key.getLength()));
                            }
                            rowAggregators = aggregators.newAggregators(c.getEnvironment().getConfiguration());
                            aggregateMap.put(key, rowAggregators);
                        }
                        // Aggregate values here
                        aggregators.aggregate(rowAggregators, result);
                        if (logger.isDebugEnabled()) {
                            logger.debug("Row passed filters: " + results + ", aggregated values: " + Arrays.asList(rowAggregators));
                        }
                            
                        if (aggregateMap.size() > estDistVals) { // increase allocation
                            estDistVals *= 1.5f;
                            estSize = sizeOfUnorderedGroupByMap(estDistVals, estValueSize);
                            chunk.resize(estSize);
                        }
                    }
                } while (hasMore);
            } finally {
                region.closeRegionOperation();
            }
    
            // Compute final allocation
            estSize = sizeOfUnorderedGroupByMap(aggregateMap.size(), estValueSize);
            chunk.resize(estSize);
            
            // TODO: spool list to disk if too big and free memory?
            final List<KeyValue> aggResults = new ArrayList<KeyValue>(aggregateMap.size());
            for (Map.Entry<ImmutableBytesWritable, Aggregator[]> entry : aggregateMap.entrySet()) {
                ImmutableBytesWritable key = entry.getKey();
                Aggregator[] rowAggregators = entry.getValue();
                // Generate byte array of Aggregators and set as value of row
                byte[] value = aggregators.toBytes(rowAggregators);
                
                if (logger.isDebugEnabled()) {
                    logger.debug("Adding new distinct group: " + Bytes.toStringBinary(key.get(),key.getOffset(), key.getLength()) + 
                            " with aggregators " + Arrays.asList(rowAggregators).toString() + 
                            " value = " + Bytes.toStringBinary(value));
                }
                KeyValue keyValue = KeyValueUtil.newKeyValue(key.get(),key.getOffset(), key.getLength(),SINGLE_COLUMN_FAMILY, SINGLE_COLUMN, AGG_TIMESTAMP, value, 0, value.length);
                aggResults.add(keyValue);
            }
            // Do not sort here, but sort back on the client instead
            // The reason is that if the scan ever extends beyond a region (which can happen
            // if we're basing our parallelization split points on old metadata), we'll get
            // incorrect query results.
            RegionScanner scanner = new BaseRegionScanner() {
                private int index = 0;
    
                @Override
                public HRegionInfo getRegionInfo() {
                    return s.getRegionInfo();
                }
    
                @Override
                public void close() throws IOException {
                    try {
                        s.close();
                    } finally {
                        chunk.close();
                    }
                }
    
                @Override
                public boolean next(List<KeyValue> results) throws IOException {
                    if (index >= aggResults.size()) return false;
                    results.add(aggResults.get(index));
                    index++;
                    return index < aggResults.size();
                }
            };
            success = true;
            return scanner;
        } finally {
            if (!success)
                chunk.close();
        }
    }


    /**
     * Used for an aggregate query in which the key order match the group by key order. In this case, we can do the
     * aggregation as we scan, by detecting when the group by key changes.
     */
    private RegionScanner scanOrdered(final ObserverContext<RegionCoprocessorEnvironment> c, Scan scan, final RegionScanner s, final List<Expression> expressions, final ServerAggregators aggregators) {
        
        if (logger.isDebugEnabled()) {
            logger.debug("Grouped aggregation over ordered rows with scan " + scan + ", group by " + expressions + ", aggregators " + aggregators);
        }
        return new BaseRegionScanner() {
            private ImmutableBytesWritable currentKey = null;


            @Override
            public HRegionInfo getRegionInfo() {
                return s.getRegionInfo();
            }


            @Override
            public void close() throws IOException {
                s.close();
            }


            @Override
            public boolean next(List<KeyValue> results) throws IOException {
                boolean hasMore;
                boolean aggBoundary = false;
                MultiKeyValueTuple result = new MultiKeyValueTuple();
                ImmutableBytesWritable key = null;
                Aggregator[] rowAggregators = aggregators.getAggregators();
                HRegion region = c.getEnvironment().getRegion();
                MultiVersionConsistencyControl.setThreadReadPoint(s.getMvccReadPoint());
                region.startRegionOperation();
                try {
                    do {
                        List<KeyValue> kvs = new ArrayList<KeyValue>();
                        // Results are potentially returned even when the return value of s.next is false
                        // since this is an indication of whether or not there are more values after the
                        // ones returned
                        hasMore = s.nextRaw(kvs, null);
                        if (!kvs.isEmpty()) {
                            result.setKeyValues(kvs);
                            key = TupleUtil.getConcatenatedValue(result, expressions);
                            aggBoundary = currentKey != null && currentKey.compareTo(key) != 0;
                            if (!aggBoundary) {
                                aggregators.aggregate(rowAggregators, result);
                                if (logger.isDebugEnabled()) {
                                    logger.debug("Row passed filters: " + kvs + ", aggregated values: " + Arrays.asList(rowAggregators));
                                }
                                currentKey = key;
                            }
                        }
                    } while (hasMore && !aggBoundary);
                } finally {
                    region.closeRegionOperation();
                }
                
                if (currentKey != null) {
                    byte[] value = aggregators.toBytes(rowAggregators);
                    KeyValue keyValue = KeyValueUtil.newKeyValue(currentKey.get(),currentKey.getOffset(), currentKey.getLength(),SINGLE_COLUMN_FAMILY, SINGLE_COLUMN, AGG_TIMESTAMP, value, 0, value.length);
                    results.add(keyValue);
                    if (logger.isDebugEnabled()) {
                        logger.debug("Adding new aggregate row: " + keyValue + ",for current key " + Bytes.toStringBinary(currentKey.get(),currentKey.getOffset(), currentKey.getLength()) + ", aggregated values: " + Arrays.asList(rowAggregators));
                    }
                    // If we're at an aggregation boundary, reset the aggregators and
                    // aggregate with the current result (which is not a part of the returned result).
                    if (aggBoundary) {
                        aggregators.reset(rowAggregators);
                        aggregators.aggregate(rowAggregators, result);
                        currentKey = key;
                    }
                }
                // Continue if there are more
                if (hasMore || aggBoundary) {
                    return true;
                }
                currentKey = null;
                return false;
            }
        };
    }
}
Source Code of org.apache.phoenix.coprocessor.GroupedAggregateRegionObserver

Related Classes of org.apache.phoenix.coprocessor.GroupedAggregateRegionObserver