Source Code of co.cask.cdap.data2.dataset2.lib.table.ordered.BufferingOrderedTable

/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */


package co.cask.cdap.data2.dataset2.lib.table.ordered;


import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.dataset.metrics.MeteredDataset;
import co.cask.cdap.api.dataset.table.ConflictDetection;
import co.cask.cdap.api.dataset.table.Result;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.api.dataset.table.TableSplit;
import co.cask.tephra.Transaction;
import co.cask.tephra.TransactionAware;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import javax.annotation.Nullable;


/**
 * An abstract {@link TransactionAware} implementation of {@link co.cask.cdap.api.dataset.table.OrderedTable} which
 * keeps data in memory buffer until transaction commits.
 * <p>
 * Subclasses should implement methods which deal with persistent store. This implementation merges data from persistent
 * store and in-memory buffer for read/write operations.
 * NOTE: this implementation does not allow storing null as a value of a column
 * NOTE: data fetched from the persisted store should never have nulls in column values: this class doesn't check
 *       that, and they could be exposed to the user as nulls. Since a null value is not allowed by this
 *       implementation, this could lead to unexpected results
 * <p>
 * This implementation assumes that the table has name and conflicts are resolved on row level.
 * <p>
 * NOTE: this implementation doesn't cache any data in-memory besides changes. I.e. if you do get of same data that is
 *       not in in-memory buffer twice, two times it will try to fetch it from persistent store.
 *       Given the snapshot isolation tx model, this can be improved in future implementations.
 * <p>
 * NOTE: current implementation persists changes only at the end of transaction. Beware of OOME. There should be better
 *       implementation for MapReduce case (YMMV though, for counters/aggregations this implementation looks sweet)
 * <p>
 * NOTE: Using {@link #get(byte[], byte[], byte[], int)} is generally inefficient since it always hits the
 *       persisted store even if all needed data is in the in-memory buffer. See the method javadoc for more info
 */
// todo: copying passed params to write methods may be done more efficiently: no need to copy when no changes are made
public abstract class BufferingOrderedTable extends AbstractOrderedTable implements TransactionAware, MeteredDataset {


  // Class-wide logger; used by startTx() to report use of a closed dataset
  private static final Logger LOG = LoggerFactory.getLogger(BufferingOrderedTable.class);


  // Zero-length array that wrapDeleteIfNeeded()/unwrapDeleteIfNeeded() use as a stand-in for a null (deleted) value.
  // NOTE: unwrapDeleteIfNeeded() compares by content, so any empty value is treated as a delete marker.
  protected static final byte[] DELETE_MARKER = new byte[0];


  // name of the table
  private final String name;
  // conflict detection level; selects which change set getTxChanges() reports
  private final ConflictDetection conflictLevel;
  // name length + name of the table: handy to have one cached
  private final byte[] nameAsTxChangePrefix;
  // Whether read-less increments should be used when increment() is called
  private final boolean enableReadlessIncrements;


  // In-memory buffer that keeps not yet persisted data. It is a row->(column->update) map.
  // NOTE: deletes issued through delete(row, columns) go through put() with null values, so they are buffered as
  //       a PutValue wrapping null rather than as a null map value.
  // Set to null by close(); startTx() treats a null buffer as "dataset closed".
  private NavigableMap<byte[], NavigableMap<byte[], Update>> buff;


  // Changes handed to persist() by commitTx(); rollbackTx() passes them to undo(). Null when there is nothing to undo.
  private NavigableMap<byte[], NavigableMap<byte[], Update>> toUndo;


  // Collector that read/write metrics are reported to; may be null, in which case nothing is reported
  private MetricsCollector metricsCollector;


  /**
   * Creates an instance of {@link BufferingOrderedTable} that detects conflicts on row level
   * ({@link ConflictDetection#ROW}).
   * @param name table name
   */
  public BufferingOrderedTable(String name) {
    this(name, ConflictDetection.ROW);
  }


  /**
   * Creates an instance of {@link BufferingOrderedTable} with read-less increments disabled.
   * @param name table name
   * @param level conflict detection level that determines what {@link #getTxChanges()} reports
   */
  public BufferingOrderedTable(String name, ConflictDetection level) {
    this(name, level, false);
  }


  /**
   * Creates an instance of {@link BufferingOrderedTable}.
   * @param name table name
   */
  public BufferingOrderedTable(String name, ConflictDetection level, boolean enableReadlessIncrements) {
    // for optimization purposes we don't allow table name of length greater than Byte.MAX_VALUE
    Preconditions.checkArgument(name.length() < Byte.MAX_VALUE,
                                "Too big table name: " + name + ", exceeds " + Byte.MAX_VALUE);
    this.name = name;
    this.conflictLevel = level;
    this.enableReadlessIncrements = enableReadlessIncrements;
    // TODO: having central dataset management service will allow us to use table ids instead of names, which will
    //       reduce changeset size transferred to/from server
    // we want it to be of format length+value to avoid conflicts like table="ab", row="cd" vs table="abc", row="d"
    this.nameAsTxChangePrefix = Bytes.add(new byte[]{(byte) name.length()}, Bytes.toBytes(name));
    this.buff = new ConcurrentSkipListMap<byte[], NavigableMap<byte[], Update>>(Bytes.BYTES_COMPARATOR);
  }


  /**
   * @return name of this table, as passed to the constructor
   */
  public String getTableName() {
    return name;
  }


  @Override
  public String getTransactionAwareName() {
    return getClass().getSimpleName() + "(table = " + name + ")";
  }


  /**
   * Persists in-memory buffer. After this method returns we assume that data can be visible to other table clients
   * (of course other clients may choose still not to see it based on transaction isolation logic).
   * <p>
   * Called by {@link #commitTx()} with the whole buffer of the current transaction when that buffer is not empty.
   * @param buff in-memory buffer to persist. Map is described as row->(column->update). Column deletes issued
   *             through this class are buffered as a {@code PutValue} wrapping a null value
   * @throws Exception implementation-specific failure; it is propagated to the caller of {@link #commitTx()}
   */
  protected abstract void persist(NavigableMap<byte[], NavigableMap<byte[], Update>> buff)
    throws Exception;


  /**
   * Undoes previously persisted changes. After this method returns we assume that data can be visible to other table
   * clients (of course other clients may choose still not to see it based on transaction isolation logic).
   * <p>
   * Called by {@link #rollbackTx()} with the map that was previously handed to
   * {@link #persist(java.util.NavigableMap)}.
   * @param persisted previously persisted changes. Map is described as row->(column->update). Column deletes issued
   *                  through this class are buffered as a {@code PutValue} wrapping a null value
   * @throws Exception implementation-specific failure; it is propagated to the caller of {@link #rollbackTx()}
   */
  protected abstract void undo(NavigableMap<byte[], NavigableMap<byte[], Update>> persisted)
    throws Exception;


  /**
   * Fetches column->value pairs for a set of columns from the persistent store.
   * NOTE: persisted store can also be in-memory, it is called "persisted" to distinguish from in-memory buffer.
   * @param row row key that defines the row to fetch columns from
   * @param columns set of columns to fetch, can be null which means fetch everything
   * @return map of column->value pairs, never null.
   * @throws Exception implementation-specific failure while reading from the store
   */
  protected abstract NavigableMap<byte[], byte[]> getPersisted(byte[] row, @Nullable byte[][] columns)
    throws Exception;


  /**
   * Fetches column->value pairs for a range of columns from the persistent store.
   * NOTE: persisted store can also be in-memory, it is called "persisted" to distinguish from in-memory buffer.
   * NOTE: Using this method is generally inefficient since it always hits the
   *       persisted store even if all needed data is in the in-memory buffer. Since the column set is not strictly
   *       defined, the implementation always looks up more columns in the persistent store.
   * @param row row key that defines the row to fetch columns from
   * @param startColumn first column in a range, inclusive
   * @param stopColumn last column in a range, exclusive
   * @param limit max number of columns to fetch
   * @return map of column->value pairs, never null.
   * @throws Exception implementation-specific failure while reading from the store
   */
  protected abstract NavigableMap<byte[], byte[]> getPersisted(byte[] row,
                                                               byte[] startColumn, byte[] stopColumn,
                                                               int limit)
    throws Exception;


  /**
   * Scans a range of rows from the persistent store.
   * NOTE: persisted store can also be in-memory, it is called "persisted" to distinguish from in-memory buffer.
   * <p>
   * {@link #scan(byte[], byte[])} passes its arguments through unchanged, so either bound may be null there.
   * @param startRow key of the first row in a range, inclusive
   * @param stopRow key of the last row in a range, exclusive
   * @return instance of {@link Scanner}, never null
   * @throws Exception implementation-specific failure while opening the scan
   */
  protected abstract  Scanner scanPersisted(byte[] startRow, byte[] stopRow) throws Exception;


  /**
   * Sets the collector that read and write operations are reported to. While no collector is set (or it is null),
   * nothing is reported.
   * @param metricsCollector collector to report data operation metrics to
   */
  @Override
  public void setMetricsCollector(MetricsCollector metricsCollector) {
    this.metricsCollector = metricsCollector;
  }


  @Override
  public void close() {
    // releasing resources
    buff = null;
    toUndo = null;
  }


  @Override
  public void startTx(Transaction tx) {
    if (buff == null) {
      String msg = "Attempted to use closed dataset " + getTransactionAwareName();
      LOG.error(msg);
      throw new IllegalStateException(msg);
    }
    // starting with fresh buffer when tx starts
    buff.clear();
    toUndo = null;
  }


  @Override
  public Collection<byte[]> getTxChanges() {
    switch (conflictLevel) {
      case NONE:
        return Collections.emptyList();
      case ROW:
        return getRowChanges();
      case COLUMN:
        return getColumnChanges();
      default:
        throw new RuntimeException("Unknown conflict detection level: " + conflictLevel);
    }
  }


  private Collection<byte[]> getRowChanges() {
    // we resolve conflicts on row level of individual table
    List<byte[]> changes = new ArrayList<byte[]>(buff.size());
    for (byte[] changedRow : buff.keySet()) {
      changes.add(Bytes.add(nameAsTxChangePrefix, changedRow));
    }
    return changes;
  }


  private Collection<byte[]> getColumnChanges() {
    // we resolve conflicts on row level of individual table
    List<byte[]> changes = new ArrayList<byte[]>(buff.size());
    for (Map.Entry<byte[], NavigableMap<byte[], Update>> rowChange : buff.entrySet()) {
      if (rowChange.getValue() == null) {
        // NOTE: as of now we cannot detect conflict between delete whole row and row's column value change.
        //       this is not a big problem as of now, as row deletion is now act as deletion of every column, but this
        //       will change in future, so we will have to address the issue.
        continue;
      }


      // using length + value format to prevent conflicts like row="ab", column="cd" vs row="abc", column="d"
      byte[] rowTxChange = Bytes.add(Bytes.toBytes(rowChange.getKey().length), rowChange.getKey());


      for (byte[] column : rowChange.getValue().keySet()) {
        changes.add(Bytes.add(nameAsTxChangePrefix, rowTxChange, column));
      }
    }
    return changes;
  }


  @Override
  public boolean commitTx() throws Exception {
    if (!buff.isEmpty()) {
      // We first assume that all data will be persisted. So that if exception happen during persist we try to
      // rollback everything we had in in-memory buffer.
      toUndo = buff;
      // clearing up in-memory buffer by initializing new map.
      // NOTE: we want to init map here so that if no changes are made we re-use same instance of the map in next tx
      // NOTE: we could cache two maps and swap them to avoid creation of map instances, but code would be ugly
      buff = new ConcurrentSkipListMap<byte[], NavigableMap<byte[], Update>>(Bytes.BYTES_COMPARATOR);
      // TODO: tracking of persisted items can be optimized by returning a pair {succeededOrNot, persisted} which
      //       tells if persisting succeeded and what was persisted (i.e. what we will have to undo in case of rollback)
      persist(toUndo);
    }
    return true;
  }


  @Override
  public void postTxCommit() {
    // don't need buffer anymore: tx has been committed
    buff.clear();
    toUndo = null;
  }


  @Override
  public boolean rollbackTx() throws Exception {
    buff.clear();
    if (toUndo != null) {
      undo(toUndo);
      toUndo = null;
    }
    return true;
  }


  /**
   * Reads all columns of a row, merging buffered changes over the persisted values, and reports one read operation.
   * <p>
   * NOTE: Depending on the use-case, calling this method may be much less
   *       efficient than calling same method with columns as parameters because it may always require round trip to
   *       persistent store
   * @param row row key
   * @return map of column->value for the row
   * @throws Exception if reading from the persistent store fails
   */
  @Override
  public Map<byte[], byte[]> get(byte[] row) throws Exception {
    reportRead(1);
    return getRowMap(row);
  }


  /**
   * Reads the given columns of a row, merging buffered changes over the persisted values, and reports one read
   * operation.
   * @param row row key
   * @param columns columns to read
   * @return map of column->value for the requested columns
   * @throws Exception if reading from the persistent store fails
   */
  @Override
  public Map<byte[], byte[]> get(byte[] row, byte[][] columns) throws Exception {
    reportRead(1);
    return getRowMap(row, columns);
  }


  @Override
  public Map<byte[], byte[]> get(byte[] row, byte[] startColumn, byte[] stopColumn, int limit)
    throws Exception {
    reportRead(1);
    // checking if the row was deleted inside this tx
    NavigableMap<byte[], Update> buffCols = buff.get(row);
    boolean rowDeleted = buffCols == null && buff.containsKey(row);
    // ANDREAS: can this ever happen?
    if (rowDeleted) {
      return Collections.emptyMap();
    }


    // NOTE: since we cannot tell the exact column set, we always have to go to persisted store.
    //       potential improvement: do not fetch columns available in in-mem buffer (we know them at this point)
    Map<byte[], byte[]> persistedCols = getPersisted(row, startColumn, stopColumn, limit);


    // adding server cols, and then overriding with buffered values
    NavigableMap<byte[], byte[]> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    if (persistedCols != null) {
      result.putAll(persistedCols);
    }


    if (buffCols != null) {
      buffCols = getRange(buffCols, startColumn, stopColumn, limit);
      // null valued columns in in-memory buffer are deletes, so we need to delete them from the result list
      mergeToPersisted(result, buffCols, null);
    }


    // applying limit
    return head(result, limit);
  }


  /**
   * NOTE: if value is null corresponded column is deleted. It will not be in result set when reading.
   *
   * Also see {@link co.cask.cdap.api.dataset.table.OrderedTable#put(byte[], byte[][], byte[][])}.
   */
  @Override
  public void put(byte[] row, byte[][] columns, byte[][] values) throws Exception {
    reportWrite(1, getSize(row) + getSize(columns) + getSize(values));
    NavigableMap<byte[], Update> colVals = buff.get(row);
    if (colVals == null) {
      colVals = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
      // NOTE: we copy passed row's byte arrays to protect buffer against possible changes of this array on client
      buff.put(copy(row), colVals);
    }
    for (int i = 0; i < columns.length; i++) {
      // NOTE: we copy passed column's and value's byte arrays to protect buffer against possible changes of these
      // arrays on client
      colVals.put(copy(columns[i]), new PutValue(copy(values[i])));
    }
  }


  /**
   * NOTE: Depending on the use-case, calling this method may be much less
   *       efficient than calling same method with columns as parameters because it may always require round trip to
   *       persistent store
   */
  @Override
  public void delete(byte[] row) throws Exception {
    // "0" because we don't know what gets deleted
    reportWrite(1, 0);
    // this is going to be expensive, but the only we can do as delete implementation act on per-column level
    Map<byte[], byte[]> rowMap = getRowMap(row);
    delete(row, rowMap.keySet().toArray(new byte[rowMap.keySet().size()][]));
  }


  @Override
  public void delete(byte[] row, byte[][] columns) throws Exception {
    if (columns == null) {
      delete(row);
      return;
    }


    // Do not delete anything when columns list is empty. Return-fast shortcut
    if (columns.length == 0) {
      return;
    }


    // "0" because we don't know what gets deleted
    reportWrite(1, 0);
    // same as writing null for every column
    // ANDREAS: shouldn't this be DELETE_MARKER?
    put(row, columns, new byte[columns.length][]);
  }


  @Override
  public Map<byte[], Long> incrementAndGet(byte[] row, byte[][] columns, long[] amounts) throws Exception {
    reportRead(1);
    reportWrite(1, getSize(row) + getSize(columns) + getSize(amounts));
    // Logic:
    // * fetching current values
    // * updating values
    // * updating in-memory store
    // * returning updated values as result
    // NOTE: there is more efficient way to do it, but for now we want more simple implementation, not over-optimizing
    Map<byte[], byte[]> rowMap = getRowMap(row, columns);
    byte[][] updatedValues = new byte[columns.length][];


    NavigableMap<byte[], Long> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    for (int i = 0; i < columns.length; i++) {
      byte[] column = columns[i];
      byte[] val = rowMap.get(column);
      // converting to long
      long longVal;
      if (val == null) {
        longVal = 0L;
      } else {
        if (val.length != Bytes.SIZEOF_LONG) {
          throw new NumberFormatException("Attempted to increment a value that is not convertible to long," +
                                            " row: " + Bytes.toStringBinary(row) +
                                            " column: " + Bytes.toStringBinary(column));
        }
        longVal = Bytes.toLong(val);
      }
      longVal += amounts[i];
      updatedValues[i] = Bytes.toBytes(longVal);
      result.put(column, longVal);
    }


    put(row, columns, updatedValues);


    return result;
  }


  @Override
  public void increment(byte[] row, byte[][] columns, long[] amounts) throws Exception {
    if (enableReadlessIncrements) {
      reportWrite(1, getSize(row) + getSize(columns) + getSize(amounts));
      NavigableMap<byte[], Update> colVals = buff.get(row);
      if (colVals == null) {
        colVals = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
        buff.put(row, colVals);
      }
      for (int i = 0; i < columns.length; i++) {
        colVals.put(columns[i], Updates.mergeUpdates(colVals.get(columns[i]), new IncrementValue(amounts[i])));
      }
    } else {
      incrementAndGet(row, columns, amounts);
    }
  }


  @Override
  public boolean compareAndSwap(byte[] row, byte[] column, byte[] expectedValue, byte[] newValue) throws Exception {
    reportRead(1);
    reportWrite(1, getSize(row) + getSize(column) + getSize(newValue));
    // NOTE: there is more efficient way to do it, but for now we want more simple implementation, not over-optimizing
    byte[][] columns = new byte[][]{column};
    byte[] currentValue = getRowMap(row, columns).get(column);
    if (Arrays.equals(expectedValue, currentValue)) {
      put(row, columns, new byte[][]{newValue});
      return true;
    }


    return false;
  }


  /**
   * Fallback implementation of getSplits, {@link SplitsUtil#primitiveGetSplits(int, byte[], byte[])}.
   * Ideally should be overridden by subclasses
   */
  @Override
  public List<Split> getSplits(int numSplits, byte[] start, byte[] stop) {
    List<KeyRange> keyRanges = SplitsUtil.primitiveGetSplits(numSplits, start, stop);
    return Lists.transform(keyRanges, new Function<KeyRange, Split>() {
      @Nullable
      @Override
      public Split apply(@Nullable KeyRange input) {
        return new TableSplit(input == null ? null : input.getStart(),
                                           input == null ? null : input.getStop());
      }
    });
  }


  @Override
  public Scanner scan(byte[] startRow, byte[] stopRow) throws Exception {
    NavigableMap<byte[], NavigableMap<byte[], Update>> bufferMap;
    if (startRow == null && stopRow == null) {
      bufferMap = buff;
    } else if (startRow == null) {
      bufferMap = buff.headMap(stopRow, false);
    } else if (stopRow == null) {
      bufferMap = buff.tailMap(startRow, true);
    } else {
      bufferMap = buff.subMap(startRow, true, stopRow, false);
    }
    return new BufferingScanner(bufferMap, scanPersisted(startRow, stopRow));
  }


  private Map<byte[], byte[]> getRowMap(byte[] row) throws Exception {
    NavigableMap<byte[], byte[]> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    // checking if the row was deleted inside this tx
    NavigableMap<byte[], Update> buffCols = buff.get(row);
    boolean rowDeleted = buffCols == null && buff.containsKey(row);
    if (rowDeleted) {
      return Collections.emptyMap();
    }


    Map<byte[], byte[]> persisted = getPersisted(row, null);




    result.putAll(persisted);
    if (buffCols != null) {
      // buffered should override those returned from persistent store
      mergeToPersisted(result, buffCols, null);
    }


    return unwrapDeletes(result);
  }


  private Map<byte[], byte[]> getRowMap(byte[] row, byte[][] columns) throws Exception {
    NavigableMap<byte[], byte[]> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    // checking if the row was deleted inside this tx
    NavigableMap<byte[], Update> buffCols = buff.get(row);
    boolean rowDeleted = buffCols == null && buff.containsKey(row);
    if (rowDeleted) {
      return Collections.emptyMap();
    }


    // if nothing locally, return all from server
    if (buffCols == null) {
      return getPersisted(row, columns);
    }


    // otherwise try to fetch data from in-memory buffer. If not all present - fetch leftover from persisted
    List<byte[]> colsToFetchFromPersisted = Lists.newArrayList();
    // try to fetch from local buffer first and then from server if it is not in buffer
    for (byte[] column : columns) {
      if (!buffCols.containsKey(column)) {
        colsToFetchFromPersisted.add(column);
        continue;
      }


      Update val = buffCols.get(column);
      // buffered increments will need to the applied on top of the persisted values
      if (val instanceof IncrementValue) {
        colsToFetchFromPersisted.add(column);
      }
    }


    // fetching from server those that were not found in in-mem buffer
    if (colsToFetchFromPersisted.size() > 0) {
      Map<byte[], byte[]> persistedCols =
        getPersisted(row, colsToFetchFromPersisted.toArray(new byte[colsToFetchFromPersisted.size()][]));
      if (persistedCols != null) {
        result.putAll(persistedCols);
      }
    }


    // overlay buffered values on persisted, applying increments where necessary
    mergeToPersisted(result, buffCols, columns);


    return unwrapDeletes(result);
  }


  /**
   * Applies the buffered updates on top of the map of persisted values.  The persisted map is modified in place
   * with the updated values.
   * @param persisted The map to modify with the buffered values.
   * @param buffered The buffered values to overlay on the persisted map.
   */
  private static void mergeToPersisted(Map<byte[], byte[]> persisted, Map<byte[], Update> buffered, byte[][] columns) {
    List<byte[]> columnKeys;
    if (columns != null) {
      columnKeys = Arrays.asList(columns);
    } else {
      // NOTE: we want to copy key's byte array because it may be leaked to table's client and we don't want client
      //       to affect the buffer by changing it in place
      columnKeys = Lists.newArrayListWithExpectedSize(buffered.size());
      for (byte[] key : buffered.keySet()) {
        columnKeys.add(copy(key));
      }
    }
    // overlay buffered values on persisted, applying increments where necessary
    for (byte[] key : columnKeys) {
      Update val = buffered.get(key);
      if (val == null) {
        if (buffered.containsKey(key)) {
          persisted.remove(key);
        }
      } else if (val instanceof IncrementValue) {
        long persistedValue = 0L;
        byte[] persistedBytes = persisted.get(key);
        if (persistedBytes != null) {
          persistedValue = Bytes.toLong(persistedBytes);
        }
        long newValue = persistedValue + ((IncrementValue) val).getValue();
        persisted.put(key, Bytes.toBytes(newValue));
      } else if (val instanceof PutValue) {
        // overwrite the current
        // NOTE: we want to copy value's byte array because it may be leaked to table's client and we don't want client
        // to affect the buffer by changing it in place
        persisted.put(key, copy(((PutValue) val).getValue()));
      }
      // unknown type?!
    }
  }


  // utilities useful for underlying implementations


  protected static <T> NavigableMap<byte[], T> getRange(NavigableMap<byte[], T> rowMap,
                                                         byte[] startColumn, byte[] stopColumn,
                                                         int limit) {
    NavigableMap<byte[], T> result;
    if (startColumn == null && stopColumn == null) {
      result = rowMap;
    } else if (startColumn == null) {
      result = rowMap.headMap(stopColumn, false);
    } else if (stopColumn == null) {
      result = rowMap.tailMap(startColumn, true);
    } else {
      result = rowMap.subMap(startColumn, true, stopColumn, false);
    }
    return head(result, limit);
  }


  protected static <T> NavigableMap<byte[], T> head(NavigableMap<byte[], T> map, int count) {
    if (count > 0 && map.size() > count) {
      // todo: is there better way to do it?
      byte [] lastToInclude = null;
      int i = 0;
      for (Map.Entry<byte[], T> entry : map.entrySet()) {
        lastToInclude = entry.getKey();
        if (++i >= count) {
          break;
        }
      }
      map = map.headMap(lastToInclude, true);
    }


    return map;
  }


  protected static byte[] wrapDeleteIfNeeded(byte[] value) {
    return value == null ? DELETE_MARKER : value;
  }


  protected static byte[] unwrapDeleteIfNeeded(byte[] value) {
    return Arrays.equals(DELETE_MARKER, value) ? null : value;
  }


  // todo: it is in-efficient to copy maps a lot, consider merging with getLatest methods
  protected static NavigableMap<byte[], NavigableMap<byte[], byte[]>> unwrapDeletesForRows(
    NavigableMap<byte[], NavigableMap<byte[], byte[]>> rows) {


    NavigableMap<byte[], NavigableMap<byte[], byte[]>> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    for (Map.Entry<byte[], NavigableMap<byte[], byte[]>> row : rows.entrySet()) {
      NavigableMap<byte[], byte[]> rowMap = unwrapDeletes(row.getValue());
      if (rowMap.size() > 0) {
        result.put(row.getKey(), rowMap);
      }
    }


    return result;
  }


  // todo: it is in-efficient to copy maps a lot, consider merging with getLatest methods
  protected static NavigableMap<byte[], byte[]> unwrapDeletes(NavigableMap<byte[], byte[]> rowMap) {
    if (rowMap == null || rowMap.isEmpty()) {
      return EMPTY_ROW_MAP;
    }
    NavigableMap<byte[], byte[]> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    for (Map.Entry<byte[], byte[]> keyVal : rowMap.entrySet()) {
      byte[] val = unwrapDeleteIfNeeded(keyVal.getValue());
      if (val != null) {
        result.put(keyVal.getKey(), val);
      }
    }
    return result;
  }


  private void reportWrite(int numOps, int dataSize) {
    if (metricsCollector != null) {
      metricsCollector.recordWrite(numOps, dataSize);
    }
  }


  private void reportRead(int numOps) {
    if (metricsCollector != null) {
      // todo: report amount of data being read
      metricsCollector.recordRead(numOps, 0);
    }
  }


  private int getSize(long[] values) {
    return Bytes.SIZEOF_LONG * values.length;
  }


  private static int getSize(byte[][] data) {
    int size = 0;
    for (byte[] item : data) {
      size += getSize(item);
    }


    return size;
  }


  private static int getSize(byte[] item) {
    return item == null ? 0 : item.length;
  }


  private static byte[] copy(byte[] bytes) {
    return bytes == null ? null : Arrays.copyOf(bytes, bytes.length);
  }


  /**
   * Scanner implementation that overlays buffered data on top of already persisted data.
   */
  private class BufferingScanner implements Scanner {
    private final NavigableMap<byte[], NavigableMap<byte[], Update>> buffer;
    private final Scanner persistedScanner;
    private final Iterator<byte[]> keyIter;
    private byte[] currentKey;
    private Row currentRow;


    private BufferingScanner(NavigableMap<byte[], NavigableMap<byte[], Update>> buffer, Scanner persistedScanner) {
      this.buffer = buffer;
      this.keyIter = this.buffer.keySet().iterator();
      if (this.keyIter.hasNext()) {
        currentKey = keyIter.next();
      }
      this.persistedScanner = persistedScanner;
      this.currentRow = this.persistedScanner.next();
    }


    @Nullable
    @Override
    public Row next() {
      if (currentKey == null && currentRow == null) {
        // out of rows
        return null;
      }
      int order;
      if (currentKey == null) {
        // exhausted buffer is the same as persisted scan row coming first
        order = 1;
      } else if (currentRow == null) {
        // exhausted persisted scanner is the same as buffer row coming first
        order = -1;
      } else {
        order = Bytes.compareTo(currentKey, currentRow.getRow());
      }
      Row result = null;
      if (order > 0) {
        // persisted row comes first or buffer is empty
        result = currentRow;
        currentRow = persistedScanner.next();
      } else if (order < 0) {
        // buffer row comes first or persisted scanner is empty
        Map<byte[], byte[]> persistedRow = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
        mergeToPersisted(persistedRow, buffer.get(currentKey), null);
        result = new Result(copy(currentKey), persistedRow);


        currentKey = keyIter.hasNext() ? keyIter.next() : null;
      } else {
        // if currentKey and currentRow are equal, merge and advance both
        Map<byte[], byte[]> persisted = currentRow.getColumns();
        mergeToPersisted(persisted, buffer.get(currentKey), null);
        result = new Result(currentRow.getRow(), persisted);


        currentRow = persistedScanner.next();
        currentKey = keyIter.hasNext() ? keyIter.next() : null;
      }
      return result;
    }


    @Override
    public void close() {
      this.persistedScanner.close();
    }
  }
}
Source Code of co.cask.cdap.data2.dataset2.lib.table.ordered.BufferingOrderedTable

Related Classes of co.cask.cdap.data2.dataset2.lib.table.ordered.BufferingOrderedTable