// This file is part of OpenTSDB.
// Copyright (C) 2011-2012  The OpenTSDB Authors.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 2.1 of the License, or (at your
// option) any later version.  This program is distributed in the hope that it
// will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
// General Public License for more details.  You should have received a copy
// of the GNU Lesser General Public License along with this program.  If not,
// see <http://www.gnu.org/licenses/>.
package net.opentsdb.core;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import com.stumbleupon.async.Callback;
import com.stumbleupon.async.Deferred;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.hbase.async.Bytes;
import org.hbase.async.HBaseRpc;
import org.hbase.async.KeyValue;
import org.hbase.async.PleaseThrottleException;

import net.opentsdb.core.Internal.Cell;
import net.opentsdb.meta.Annotation;
import net.opentsdb.stats.StatsCollector;
import net.opentsdb.utils.JSON;

/**
* "Queue" of rows to compact.
* <p>
* Whenever we write a data point to HBase, the row key we write to is added
* to this queue, which is effectively a sorted set.  There is a separate
* thread that periodically goes through the queue and looks for "old rows" to
* compact.  A row is considered "old" if the timestamp in the row key is
* older than a certain threshold.
* <p>
* The compaction process consists of reading all the cells within a given row
* and writing them back out as a single big cell.  Once that write succeeds,
* we delete all the individual little cells.
* <p>
* This process is effective because in HBase the row key is repeated for
* every single cell.  And because there is no way to efficiently append bytes
* at the end of a cell, we have to do this instead.
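* <p>
* For example (purely illustrative; qualifiers and values shown abstractly), a
* row holding three individual data points stored as three separate cells:
* <pre>
*   {qual1: val1}  {qual2: val2}  {qual3: val3}
* </pre>
* is rewritten as one cell whose qualifier is the concatenation of the
* individual qualifiers and whose value is the concatenation of the individual
* values (plus a trailing meta byte when seconds and milliseconds are mixed):
* <pre>
*   {qual1 qual2 qual3: val1 val2 val3 [meta]}
* </pre>
* after which the three original little cells are deleted.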
*/
final class CompactionQueue extends ConcurrentSkipListMap<byte[], Boolean> {

  private static final Logger LOG = LoggerFactory.getLogger(CompactionQueue.class);

  /** Used to sort individual columns from a data row */
  private static final Internal.KeyValueComparator COMPARATOR =
    new Internal.KeyValueComparator();
 
  /**
   * How many items are currently in the queue.
   * Tracked separately because {@link ConcurrentSkipListMap#size}
   * has O(N) complexity.
   */
  private final AtomicInteger size = new AtomicInteger();

  private final AtomicLong trivial_compactions = new AtomicLong();
  private final AtomicLong complex_compactions = new AtomicLong();
  private final AtomicLong written_cells = new AtomicLong();
  private final AtomicLong deleted_cells = new AtomicLong();

  /** The {@code TSDB} instance we belong to. */
  private final TSDB tsdb;

  /** On how many bytes do we encode metrics IDs.  */
  private final short metric_width;

  /**
   * Constructor.
   * @param tsdb The TSDB we belong to.
   */
  public CompactionQueue(final TSDB tsdb) {
    super(new Cmp(tsdb));
    this.tsdb = tsdb;
    metric_width = tsdb.metrics.width();
    if (tsdb.config.enable_compactions()) {
      startCompactionThread();
    }
  }

  @Override
  public int size() {
    return size.get();
  }

  public void add(final byte[] row) {
    if (super.put(row, Boolean.TRUE) == null) {
      size.incrementAndGet();  // We added a new entry, count it.
    }
  }

  /**
   * Forces a flush of all the old entries in the compaction queue.
   * @return A deferred that will be called back once everything has been
   * flushed (or something failed, in which case the deferred will carry the
   * exception).  In case of success, the kind of object returned is
   * unspecified.
   */
  public Deferred<ArrayList<Object>> flush() {
    final int size = size();
    if (size > 0) {
      LOG.info("Flushing all old outstanding rows out of " + size + " rows");
    }
    final long now = System.currentTimeMillis();
    return flush(now / 1000 - Const.MAX_TIMESPAN - 1, Integer.MAX_VALUE);
  }

  /**
   * Collects the stats and metrics tracked by this instance.
   * @param collector The collector to use.
   */
  void collectStats(final StatsCollector collector) {
    collector.record("compaction.count", trivial_compactions, "type=trivial");
    collector.record("compaction.count", complex_compactions, "type=complex");
    if (!tsdb.config.enable_compactions()) {
      return;
    }
    // The remaining stats only make sense with compactions enabled.
    collector.record("compaction.queue.size", size);
    collector.record("compaction.errors", handle_read_error.errors, "rpc=read");
    collector.record("compaction.errors", handle_write_error.errors, "rpc=put");
    collector.record("compaction.errors", handle_delete_error.errors,
                     "rpc=delete");
    collector.record("compaction.writes", written_cells);
    collector.record("compaction.deletes", deleted_cells);
  }

  /**
   * Flushes all the rows in the compaction queue older than the cutoff time.
   * @param cut_off A UNIX timestamp in seconds (unsigned 32-bit integer).
   * @param maxflushes How many rows to flush off the queue at once.
   * This integer is expected to be strictly positive.
   * @return A deferred that will be called back once everything has been
   * flushed.
   */
  private Deferred<ArrayList<Object>> flush(final long cut_off, int maxflushes) {
    assert maxflushes > 0: "maxflushes must be > 0, but I got " + maxflushes;
    // We can't possibly flush more entries than size().
    maxflushes = Math.min(maxflushes, size());
    if (maxflushes == 0) {  // Because size() might be 0.
      return Deferred.fromResult(new ArrayList<Object>(0));
    }
    final ArrayList<Deferred<Object>> ds =
      new ArrayList<Deferred<Object>>(Math.min(maxflushes,
                                               MAX_CONCURRENT_FLUSHES));
    int nflushes = 0;
    int seed = (int) (System.nanoTime() % 3);
    for (final byte[] row : this.keySet()) {
      if (maxflushes == 0) {
        break;
      }
      if (seed == row.hashCode() % 3) {
        continue;
      }
      final long base_time = Bytes.getUnsignedInt(row, metric_width);
      if (base_time > cut_off) {
        break;
      } else if (nflushes == MAX_CONCURRENT_FLUSHES) {
        // We kicked off the compaction of too many rows already, let's wait
        // until they're done before kicking off more.
        break;
      }
      // You'd think that it would be faster to grab an iterator on the map
      // and then call remove() on the iterator to "unlink" the element
      // directly from where the iterator is at, but no, the JDK implements
      // it by calling remove(key) so it has to lookup the key again anyway.
      if (super.remove(row) == null) {  // We didn't remove anything.
        continue;  // So someone else already took care of this entry.
      }
      nflushes++;
      maxflushes--;
      size.decrementAndGet();
      ds.add(tsdb.get(row).addCallbacks(compactcb, handle_read_error));
    }
    final Deferred<ArrayList<Object>> group = Deferred.group(ds);
    if (nflushes == MAX_CONCURRENT_FLUSHES && maxflushes > 0) {
      // We're not done yet.  Once this group of flushes completes, we need
      // to kick off more.
      tsdb.flush();  // Speed up this batch by telling the client to flush.
      final int maxflushez = maxflushes;  // Make it final for closure.
      final class FlushMoreCB implements Callback<Deferred<ArrayList<Object>>,
                                                  ArrayList<Object>> {
        public Deferred<ArrayList<Object>> call(final ArrayList<Object> arg) {
          return flush(cut_off, maxflushez);
        }
        public String toString() {
          return "Continue flushing with cut_off=" + cut_off
            + ", maxflushes=" + maxflushez;
        }
      }
      group.addCallbackDeferring(new FlushMoreCB());
    }
    return group;
  }

  private final CompactCB compactcb = new CompactCB();

  /**
   * Callback to compact a row once it's been read.
   * <p>
   * This is used once the "get" completes, to actually compact the row and
   * write back the compacted version.
   */
  private final class CompactCB implements Callback<Object, ArrayList<KeyValue>> {
    public Object call(final ArrayList<KeyValue> row) {
      return compact(row, null);
    }
    public String toString() {
      return "compact";
    }
  }

  /**
   * Compacts a row into a single {@link KeyValue}.
   * @param row The row containing all the KVs to compact.
   * Must contain at least one element.
   * @param annotations A list into which any annotation cells found in the
   * row are collected.
   * @return A compacted version of this row.
   */
  KeyValue compact(final ArrayList<KeyValue> row,
      List<Annotation> annotations) {
    final KeyValue[] compacted = { null };
    compact(row, compacted, annotations);
    return compacted[0];
  }

  /**
   * Compacts a row into a single {@link KeyValue}.
   * <p>
   * If the {@code row} is empty, this function does literally nothing.
   * If {@code compacted} is not {@code null}, then the compacted form of this
   * {@code row} will be stored in {@code compacted[0]}.  Obviously, if the
   * {@code row} contains a single cell, then that cell is the compacted form.
   * Otherwise the compaction process takes place.
   * @param row The row containing all the KVs to compact.  Must be non-null.
   * @param compacted If non-null, the first item in the array will be set to
   * a {@link KeyValue} containing the compacted form of this row.
   * If non-null, we will also not write the compacted form back to HBase
   * unless the timestamp in the row key is old enough.
   * @param annotations A list into which any annotation cells found in the
   * row are collected.
   * @return A {@link Deferred} if the compaction process required a write
   * to HBase, otherwise {@code null}.
   */
  private Deferred<Object> compact(final ArrayList<KeyValue> row,
                                   final KeyValue[] compacted,
                                   List<Annotation> annotations) {
    if (row.size() <= 1) {
      if (row.isEmpty()) {  // Maybe the row got deleted in the mean time?
        LOG.debug("Attempted to compact a row that doesn't exist.");
      } else if (compacted != null) {
        // no need to re-compact rows containing a single value.
        KeyValue kv = row.get(0);
        final byte[] qual = kv.qualifier();
        if (qual.length % 2 != 0 || qual.length == 0) {
          // This could be a row with only an annotation in it
          if ((qual[0] | Annotation.PREFIX()) == Annotation.PREFIX()) {
            final Annotation note = JSON.parseToObject(kv.value(),
                Annotation.class);
            annotations.add(note);
          }
          return null;
        }
        final byte[] val = kv.value();
        if (qual.length == 2 && Internal.floatingPointValueToFix(qual[1], val)) {
          // Fix up old, incorrectly encoded floating point value.
          final byte[] newval = Internal.fixFloatingPointValue(qual[1], val);
          final byte[] newqual = new byte[] { qual[0],
            Internal.fixQualifierFlags(qual[1], newval.length) };
          kv = new KeyValue(kv.key(), kv.family(), newqual, newval);
        }
        compacted[0] = kv;
      }
      return null;
    }

    // We know we have at least 2 cells.  We need to go through all the cells
    // to determine what kind of compaction we're going to do.  If each cell
    // contains a single individual data point, then we can do a trivial
    // compaction.  Otherwise, we have a partially compacted row, and the
    // logic required to compact it is more complex.
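    // Concretely (as the checks below encode it): a 2-byte qualifier is a
    // single second-precision point, a 4-byte qualifier is normally a single
    // millisecond-precision point (or, rarely, two concatenated second
    // columns), any longer even-length qualifier belongs to an already
    // compacted cell, and odd-length or empty qualifiers are annotations or
    // junk that get pulled out of the row below.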
    boolean write = true;  // Do we need to write a compacted cell?
    final KeyValue compact;
    {
      boolean trivial = true;  // Are we doing a trivial compaction?
      boolean ms_in_row = false;
      boolean s_in_row = false;
      int qual_len = 0;  // Pre-compute the size of the qualifier we'll need.
      int val_len = 1;   // Reserve an extra byte for meta-data.
      KeyValue longest = row.get(0);  // KV with the longest qualifier.
      int longest_idx = 0;            // Index of `longest'.
      int nkvs = row.size();
      for (int i = 0; i < nkvs; i++) {
        final KeyValue kv = row.get(i);
        final byte[] qual = kv.qualifier();
        // If the qualifier length isn't 2 or 4 bytes, this row might have
        // already been compacted, potentially partially, so we need to merge
        // the partially compacted set of cells with the rest.
        final int len = qual.length;
        if (len != 2 && len != 4) {
          // Datapoints and compacted columns should have qualifiers with an
          // even number of bytes. If we find one with an odd number, or an
          // empty qualifier (which is possible), we need to remove it from the
          // compaction queue.
          if (len % 2 != 0 || len == 0) {
            // if the qualifier is 3 bytes and starts with the Annotation prefix,
            // parse it out.
            if ((qual[0] | Annotation.PREFIX()) == Annotation.PREFIX()) {
              final Annotation note = JSON.parseToObject(kv.value(),
                  Annotation.class);
              annotations.add(note);
            }
           
            row.remove(i);  // This is O(n) but should happen *very* rarely.
            nkvs--;
            i--;
            continue;
          }
          trivial = false;
          // We only do this here because no qualifier can be < 2 bytes.
          if (len > longest.qualifier().length) {
            longest = kv;
            longest_idx = i;
          }
         
          // we need to check the value meta flag to see if the already compacted
          // column has a mixture of second and millisecond timestamps
          if ((kv.value()[kv.value().length - 1] & Const.MS_MIXED_COMPACT) ==
            Const.MS_MIXED_COMPACT) {
            ms_in_row = s_in_row = true;
          }
        } else {
          if (Internal.inMilliseconds(qual[0])) {
            ms_in_row = true;
          } else {
            s_in_row = true;
          }
         
          if (len > longest.qualifier().length) {
            longest = kv;
            longest_idx = i;
          }
         
          // there may be a situation where two second columns are concatenated
          // into 4 bytes. If so, we need to perform a complex compaction
          if (len == 4) {
            if (!Internal.inMilliseconds(qual[0])) {
              trivial = false;
            }
            val_len += kv.value().length;
          } else {
            // We don't need it below for complex compactions, so we update it
            // only here in the `else' branch.
            final byte[] v = kv.value();
            val_len += Internal.floatingPointValueToFix(qual[1], v) ? 4 : v.length;
          }
        }
        qual_len += len;
      }

      if (row.size() < 2) {
        // We got here because we started off with at least 2 KV, but we
        // chose to ignore some in the mean time, so now we're left with
        // either none, or just one.
        if (row.isEmpty()) {
          return null;  // No KV left, just ignore this whole row.
        } // else: row.size() == 1
        // We have only one KV left, we call ourselves recursively to handle
        // the case where this KV is an old, incorrectly encoded floating
        // point value that needs to be fixed.  This is guaranteed to not
        // recurse again.
        return compact(row, compacted, annotations);
      } else if (trivial) {
        trivial_compactions.incrementAndGet();
        compact = trivialCompact(row, qual_len, val_len, (ms_in_row && s_in_row));
      } else {
        complex_compactions.incrementAndGet();
        compact = complexCompact(row, qual_len / 2, (ms_in_row && s_in_row));
        // Now it's vital that we check whether the compact KV has the same
        // qualifier as one of the qualifiers that were already in the row.
        // Otherwise we might do a `put' in this cell, followed by a delete.
        // We don't want to delete what we just wrote.
        // This can happen if this row was already compacted but someone
        // wrote another individual data point at the same timestamp.
        // Optimization: since we kept track of which KV had the longest
        // qualifier, we can opportunistically check here if it happens to
        // have the same qualifier as the one we just created.
        final byte[] qual = compact.qualifier();
        final byte[] longest_qual = longest.qualifier();
        if (qual.length <= longest_qual.length) {
          KeyValue dup = null;
          int dup_idx = -1;
          if (Bytes.equals(longest_qual, qual)) {
            dup = longest;
            dup_idx = longest_idx;
          } else {
            // Worst case: to be safe we have to loop again and check all
            // the qualifiers and make sure we're not going to overwrite
            // anything.
            // TODO(tsuna): Try to write a unit test that triggers this code
            // path.  I'm not even sure it's possible.  Should we replace
            // this code with an `assert false: "should never be here"'?
            for (int i = 0; i < nkvs; i++) {
              final KeyValue kv = row.get(i);
              if (Bytes.equals(kv.qualifier(), qual)) {
                dup = kv;
                dup_idx = i;
                break;
              }
            }
          }
          if (dup != null) {
            // So we did find an existing KV with the same qualifier.
            // Let's check if, by chance, the value is the same too.
            if (Bytes.equals(dup.value(), compact.value())) {
              // Since the values are the same, we don't need to write
              // anything.  There's already a properly compacted version of
              // this row in TSDB.
              write = false;
            }
            // Now let's make sure we don't delete this qualifier.  This
            // re-allocates the entire array, but should be a rare case.
            row.remove(dup_idx);
          } // else: no dup, we're good.
        } // else: most common case: the compact qualifier is longer than
          // the previously longest qualifier, so we cannot possibly
          // overwrite an existing cell we would then delete.
      }
    }
    if (compacted != null) {  // Caller is interested in the compacted form.
      compacted[0] = compact;
      final long base_time = Bytes.getUnsignedInt(compact.key(), metric_width);
      final long cut_off = System.currentTimeMillis() / 1000
        - Const.MAX_TIMESPAN - 1;
      if (base_time > cut_off) {  // If row is too recent...
        return null;              // ... Don't write back compacted.
      }
    }
    if (!tsdb.config.enable_compactions()) {
      return null;
    }

    final byte[] key = compact.key();
    //LOG.debug("Compacting row " + Arrays.toString(key));
    deleted_cells.addAndGet(row.size());  // We're going to delete this.
    if (write) {
      final byte[] qual = compact.qualifier();
      final byte[] value = compact.value();
      written_cells.incrementAndGet();
      return tsdb.put(key, qual, value)
        .addCallbacks(new DeleteCompactedCB(row), handle_write_error);
    } else {
      // We had nothing to write, because one of the cells is already the
      // correctly compacted version, so we can go ahead and delete the
      // individual cells directly.
      new DeleteCompactedCB(row).call(null);
      return null;
    }
  }

  /**
   * Performs a trivial compaction of a row.
   * <p>
   * This method is to be used only when all the cells in the given row
   * are individual data points (nothing has been compacted yet).  If any of
   * the cells have already been compacted, the caller is expected to call
   * {@link #complexCompact} instead.
   * @param row The row to compact.  Assumed to have 2 elements or more.
   * @param qual_len Exact number of bytes to hold the compacted qualifiers.
   * @param val_len Exact number of bytes to hold the compacted values.
   * @param sort Whether or not we have a mix of ms and s qualifiers and need
   * to manually sort
   * @return a {@link KeyValue} containing the result of the merge of all the
   * {@code KeyValue}s given in argument.
   */
  private static KeyValue trivialCompact(final ArrayList<KeyValue> row,
                                         final int qual_len,
                                         final int val_len,
                                         final boolean sort) {
    // Now let's simply concatenate all the qualifiers and values together.
    final byte[] qualifier = new byte[qual_len];
    final byte[] value = new byte[val_len];
    // Now populate the arrays by copying qualifiers/values over.
    int qual_idx = 0;
    int val_idx = 0;
    int last_delta = -1;  // Time delta, extracted from the qualifier.
   
    if (sort) {
      // we have a mix of millisecond and second columns so we need to sort them
      // by timestamp before compaction
      Collections.sort(row, COMPARATOR);
    }
   
    for (final KeyValue kv : row) {
      final byte[] q = kv.qualifier();
      // We shouldn't get into this function if this isn't true.
      assert q.length == 2 || q.length == 4:
        "Qualifier length must be 2 or 4: " + kv;
     
      // check to make sure that the row was already sorted, or if there was a
      // mixture of second and ms timestamps, that we sorted successfully
      final int delta = Internal.getOffsetFromQualifier(q);
      if (delta <= last_delta) {
        throw new IllegalDataException("Found out of order or duplicate"
          + " data: last_delta=" + last_delta + ", delta=" + delta
          + ", offending KV=" + kv + ", row=" + row + " -- run an fsck.");
      }
      last_delta = delta;
     
      final byte[] v;
      if (q.length == 2) {
        v = Internal.fixFloatingPointValue(q[1], kv.value());
        qualifier[qual_idx++] = q[0];
        qualifier[qual_idx++] = Internal.fixQualifierFlags(q[1], v.length);
      } else {
        v = kv.value();
        System.arraycopy(q, 0, qualifier, qual_idx, q.length);
        qual_idx += q.length;
      }
      System.arraycopy(v, 0, value, val_idx, v.length);
      val_idx += v.length;
    }
    // Set the meta flag in the values if we have a mix of seconds and ms,
    // otherwise we just leave them alone.
    if (sort) {
      value[value.length - 1] |= Const.MS_MIXED_COMPACT;
    }
    final KeyValue first = row.get(0);
    return new KeyValue(first.key(), first.family(), qualifier, value);
  }

  /**
   * Compacts a partially compacted row.
   * <p>
   * This method is called in the non-trivial re-compaction cases, where a row
   * already contains one or more partially compacted cells.  This can happen
   * for various reasons, such as TSDs dying in the middle of a compaction or
   * races involved with TSDs trying to compact the same row at the same
   * time, or old data being slowly written to a TSD.
   * @param row The row to compact.  Assumed to have 2 elements or more.
   * @param estimated_nvalues Estimate of the number of values to compact.
   * Used to pre-allocate a collection of the right size, so it's better to
   * overshoot a bit to avoid re-allocations.
   * @param sort Whether or not we have a mix of ms and s qualifiers and need
   * to manually sort
   * @return a {@link KeyValue} containing the result of the merge of all the
   * {@code KeyValue}s given in argument.
   * @throws IllegalDataException if one of the cells cannot be read because
   * it's corrupted or in a format we don't understand.
   */
  static KeyValue complexCompact(final ArrayList<KeyValue> row,
                                 final int estimated_nvalues,
                                 final boolean sort) {
    // We know at least one of the cells contains multiple values, and we need
    // to merge all the cells together in a sorted fashion.  We use a simple
    // strategy: split all the cells into individual objects, sort them,
    // merge the result while ignoring duplicates (same qualifier & value).
    final ArrayList<Cell> cells =
      Internal.extractDataPoints(row, estimated_nvalues);

    if (sort) {
      // we have a mix of millisecond and second columns so we need to sort them
      // by timestamp before compaction
      Collections.sort(row, new Internal.KeyValueComparator());
    }
   
    // Now let's do one pass first to compute the length of the compacted
    // value and to find if we have any bad duplicates (same qualifier,
    // different value).
    int qual_len = 0;
    int val_len = 1;  // Reserve an extra byte for meta-data.
    int last_delta = -1;  // Time delta, extracted from the qualifier.
    int ncells = cells.size();
    for (int i = 0; i < ncells; i++) {
      final Cell cell = cells.get(i);
      final int delta = Internal.getOffsetFromQualifier(cell.qualifier);
     
      // Because we sorted `cells' by qualifier, and because the time delta
      // occupies the most significant bits, this should never trigger.
      assert delta >= last_delta: ("WTF? It's supposed to be sorted: " + cells
                                   + " at " + i + " delta=" + delta
                                   + ", last_delta=" + last_delta);
      // The only troublesome case is where we have two (or more) consecutive
      // cells with the same time delta, but different flags or values.
      if (delta == last_delta) {
        // Find the previous cell.  Because we potentially replace the one
        // right before `i' with a tombstone, we might need to look further
        // back a bit more.
        Cell prev = Cell.SKIP;
        // i > 0 because we can't get here during the first iteration.
        // Also the first Cell cannot be Cell.SKIP, so `j' will never
        // underflow.  And even if it does, we'll get an exception.
        for (int j = i - 1; prev == Cell.SKIP; j--) {
          prev = cells.get(j);
        }
        if (cell.qualifier[1] != prev.qualifier[1]
            || !Bytes.equals(cell.value, prev.value)) {
          throw new IllegalDataException("Found out of order or duplicate"
            + " data: cell=" + cell + ", delta=" + delta + ", prev cell="
            + prev + ", last_delta=" + last_delta + ", in row=" + row
            + " -- run an fsck.");
        }
        // else: we're good, this is a true duplicate (same qualifier & value).
        // Just replace it with a tombstone so we'll skip it.  We don't delete
        // it from the array because that would cause a re-allocation.
        cells.set(i, Cell.SKIP);
        continue;
      }
      last_delta = delta;
      qual_len += cell.qualifier.length;
      val_len += cell.value.length;
    }

    final byte[] qualifier = new byte[qual_len];
    final byte[] value = new byte[val_len];
    // Now populate the arrays by copying qualifiers/values over.
    int qual_idx = 0;
    int val_idx = 0;
    for (final Cell cell : cells) {
      if (cell == Cell.SKIP) {
        continue;
      }
      byte[] b = cell.qualifier;
      System.arraycopy(b, 0, qualifier, qual_idx, b.length);
      qual_idx += b.length;
      b = cell.value;
      System.arraycopy(b, 0, value, val_idx, b.length);
      val_idx += b.length;
    }
   
    // Set the meta flag in the values if we have a mix of seconds and ms,
    // otherwise we just leave them alone.
    if (sort) {
      value[value.length - 1] |= Const.MS_MIXED_COMPACT;
    }
    final KeyValue first = row.get(0);
    final KeyValue kv = new KeyValue(first.key(), first.family(),
                                     qualifier, value);
    return kv;
  }

  /**
   * Callback to delete a row that's been successfully compacted.
   */
  private final class DeleteCompactedCB implements Callback<Object, Object> {

    /** What we're going to delete.  */
    private final byte[] key;
    private final byte[][] qualifiers;

    public DeleteCompactedCB(final ArrayList<KeyValue> cells) {
      final KeyValue first = cells.get(0);
      key = first.key();
      qualifiers = new byte[cells.size()][];
      for (int i = 0; i < qualifiers.length; i++) {
        qualifiers[i] = cells.get(i).qualifier();
      }
    }

    public Object call(final Object arg) {
      return tsdb.delete(key, qualifiers).addErrback(handle_delete_error);
    }

    public String toString() {
      return "delete compacted cells";
    }

  }

  private final HandleErrorCB handle_read_error = new HandleErrorCB("read");
  private final HandleErrorCB handle_write_error = new HandleErrorCB("write");
  private final HandleErrorCB handle_delete_error = new HandleErrorCB("delete");

  /**
   * Callback to handle exceptions during the compaction process.
   */
  private final class HandleErrorCB implements Callback<Object, Exception> {

    private volatile int errors;

    private final String what;

    /**
     * Constructor.
     * @param what String describing what kind of operation (e.g. "read").
     */
    public HandleErrorCB(final String what) {
      this.what = what;
    }

    public Object call(final Exception e) {
      if (e instanceof PleaseThrottleException) {  // HBase isn't keeping up.
        final HBaseRpc rpc = ((PleaseThrottleException) e).getFailedRpc();
        if (rpc instanceof HBaseRpc.HasKey) {
          // We failed to compact this row.  Whether it's because of a failed
          // get, put or delete, we should re-schedule this row for a future
          // compaction.
          add(((HBaseRpc.HasKey) rpc).key());
          return Boolean.TRUE;  // We handled it, so don't return an exception.
        } else {  // Should never get in this clause.
          LOG.error("WTF?  Cannot retry this RPC, and this shouldn't happen: "
                    + rpc);
        }
      }
      // `++' is not atomic but doesn't matter if we miss some increments.
      if (++errors % 100 == 1) {  // Basic rate-limiting to not flood logs.
        LOG.error("Failed to " + what + " a row to re-compact", e);
      }
      return e;
    }

    public String toString() {
      return "handle " + what + " error";
    }
  }

  static final long serialVersionUID = 1307386642;

  /** Starts a compaction thread.  Only one such thread is needed.  */
  private void startCompactionThread() {
    final Thrd thread = new Thrd();
    thread.setDaemon(true);
    thread.start();
  }

  /** How frequently the compaction thread wakes up to flush stuff.  */
  // TODO(tsuna): Make configurable?
  private static final int FLUSH_INTERVAL = 10;  // seconds

  /** Minimum number of rows we'll attempt to compact at once.  */
  // TODO(tsuna): Make configurable?
  private static final int MIN_FLUSH_THRESHOLD = 100;  // rows

  /** Maximum number of rows we'll compact concurrently.  */
  // TODO(tsuna): Make configurable?
  private static final int MAX_CONCURRENT_FLUSHES = 10000;  // rows

  /** If this is X then we'll flush X times faster than we really need.  */
  // TODO(tsuna): Make configurable?
  private static final int FLUSH_SPEED = 2;  // multiplicative factor

  /**
   * Background thread to trigger periodic compactions.
   */
  final class Thrd extends Thread {
    public Thrd() {
      super("CompactionThread");
    }

    public void run() {
      while (true) {
        try {
          final int size = size();
          // Flush if we have too many rows to recompact.
          // Note that we might not be able to actually
          // flush anything if the rows aren't old enough.
          if (size > MIN_FLUSH_THRESHOLD) {
            // How much should we flush during this iteration?  This scheme is
            // adaptive and flushes at a rate that is proportional to the size
            // of the queue, so we flush more aggressively if the queue is big.
            // Let's suppose MAX_TIMESPAN = 1h.  We have `size' rows to compact,
            // and we better compact them all in less than 1h, otherwise we're
            // going to "fall behind" when a new hour starts (as we'll be
            // inserting a ton of new rows then).  So slice MAX_TIMESPAN using
            // FLUSH_INTERVAL to compute what fraction of `size' we need to
            // flush at each iteration.  Note that `size' will usually account
            // for many rows that can't be flushed yet (not old enough) so we're
            // overshooting a bit (flushing more aggressively than necessary).
            // This isn't a problem at all.  The only thing that matters is that
            // the rate at which we flush stuff is proportional to how much work
            // is sitting in the queue.  The multiplicative factor FLUSH_SPEED
            // is added to make flush even faster than we need.  For example, if
            // FLUSH_SPEED is 2, then instead of taking 1h to flush what we have
            // for the previous hour, we'll take only 30m.  This is desirable so
            // that we evict old entries from the queue a bit faster.
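            // Worked example, continuing the assumed 1h MAX_TIMESPAN above:
            // with FLUSH_INTERVAL=10s, FLUSH_SPEED=2 and size=72000, we get
            // 72000 * 10 * 2 / 3600 = 400 rows per iteration, so the whole
            // backlog drains in about (72000 / 400) * 10s = 30 minutes.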
            final int maxflushes = Math.max(MIN_FLUSH_THRESHOLD,
              size * FLUSH_INTERVAL * FLUSH_SPEED / Const.MAX_TIMESPAN);
            final long now = System.currentTimeMillis();
            flush(now / 1000 - Const.MAX_TIMESPAN - 1, maxflushes);
            if (LOG.isDebugEnabled()) {
              final int newsize = size();
              LOG.debug("flush() took " + (System.currentTimeMillis() - now)
                        + "ms, new queue size=" + newsize
                        + " (" + (newsize - size) + ')');
            }
          }
        } catch (Exception e) {
          LOG.error("Uncaught exception in compaction thread", e);
        } catch (OutOfMemoryError e) {
          // Let's free up some memory by throwing away the compaction queue.
          final int sz = size.get();
          CompactionQueue.super.clear();
          size.set(0);
          LOG.error("Discarded the compaction queue, size=" + sz, e);
        } catch (Throwable e) {
          LOG.error("Uncaught *Throwable* in compaction thread", e);
          // Catching this kind of error is totally unexpected and is really
          // bad.  If we do nothing and let this thread die, we'll run out of
          // memory as new entries are added to the queue.  We could always
          // commit suicide, but it's kind of drastic and nothing else in the
          // code does this.  If `enable_compactions' wasn't final, we could
          // always set it to false, but that's not an option.  So in order to
          // try to get a fresh start, let this compaction thread terminate
          // and spin off a new one instead.
          try {
            Thread.sleep(1000);  // Avoid busy looping creating new threads.
          } catch (InterruptedException i) {
            LOG.error("Compaction thread interrupted in error handling", i);
            return;  // Don't flush, we're truly hopeless.
          }
          startCompactionThread();
          return;
        }
        try {
          Thread.sleep(FLUSH_INTERVAL * 1000);
        } catch (InterruptedException e) {
          LOG.error("Compaction thread interrupted, doing one last flush", e);
          flush();
          return;
        }
      }
    }
  }

  /**
   * Helper to sort the byte arrays in the compaction queue.
   * <p>
   * This comparator sorts things by timestamp first, so that we can find
   * all rows of the same age at once.
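   * <p>
   * For reference (as used elsewhere in this class): a row key starts with
   * {@code metric_width} bytes of metric ID followed by
   * {@link Const#TIMESTAMP_BYTES} bytes of base time, so we compare the
   * timestamp bytes first and fall back to comparing the whole key only
   * when the timestamps are equal.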
   */
  private static final class Cmp implements Comparator<byte[]> {

    /** On how many bytes do we encode metrics IDs.  */
    private final short metric_width;

    public Cmp(final TSDB tsdb) {
      metric_width = tsdb.metrics.width();
    }

    public int compare(final byte[] a, final byte[] b) {
      final int c = Bytes.memcmp(a, b, metric_width, Const.TIMESTAMP_BYTES);
      // If the timestamps are equal, sort according to the entire row key.
      return c != 0 ? c : Bytes.memcmp(a, b);
    }
  }

}