Source Code of org.apache.accumulo.core.iterators.LargeRowFilter

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.core.iterators;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;


import org.apache.accumulo.core.data.ArrayByteSequence;
import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.PartialKey;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.IteratorEnvironment;
import org.apache.accumulo.core.iterators.OptionDescriber;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope;
import org.apache.hadoop.io.Text;


/**
 * This iterator suppresses rows that exceed a specified number of columns. Once a row exceeds the threshold, a marker is emitted and the row is always
 * suppressed by this iterator after that point in time.
 * 
 * This iterator works in a similar way to the RowDeletingIterator. See its javadoc about locality groups.
 */
public class LargeRowFilter implements SortedKeyValueIterator<Key,Value>, OptionDescriber {
  
  public static final Value SUPPRESS_ROW_VALUE = new Value("SUPPRESS_ROW".getBytes());
  
  private static final ByteSequence EMPTY = new ArrayByteSequence(new byte[] {});
  
  /* key into hash map, value refers to the row suppression limit (maxColumns) */
  public static final String MAX_COLUMNS = "max_columns";
  
  private SortedKeyValueIterator<Key,Value> source;
  
  // a cache of keys
  private ArrayList<Key> keys = new ArrayList<Key>();
  private ArrayList<Value> values = new ArrayList<Value>();
  
  private int currentPosition;
  
  private int maxColumns;
  
  private boolean propogateSuppression = false;
  
  private Range range;
  private Collection<ByteSequence> columnFamilies;
  private boolean inclusive;
  private boolean dropEmptyColFams;
  
  private boolean isSuppressionMarker(Key key, Value val) {
    return key.getColumnFamilyData().length() == 0 && key.getColumnQualifierData().length() == 0 && key.getColumnVisibilityData().length() == 0
        && val.equals(SUPPRESS_ROW_VALUE);
  }
  
  private void reseek(Key key) throws IOException {
    if (range.afterEndKey(key)) {
      range = new Range(range.getEndKey(), true, range.getEndKey(), range.isEndKeyInclusive());
      source.seek(range, columnFamilies, inclusive);
    } else {
      range = new Range(key, true, range.getEndKey(), range.isEndKeyInclusive());
      source.seek(range, columnFamilies, inclusive);
    }
  }
  
  private void consumeRow(ByteSequence row) throws IOException {
    // try reading a few and if still not to next row, then seek
    int count = 0;
    
    while (source.hasTop() && source.getTopKey().getRowData().equals(row)) {
      source.next();
      count++;
      if (count >= 10) {
        Key nextRowStart = new Key(new Text(row.toArray())).followingKey(PartialKey.ROW);
        reseek(nextRowStart);
        count = 0;
      }
    }
  }
  
  private void addKeyValue(Key k, Value v) {
    if (dropEmptyColFams && k.getColumnFamilyData().equals(EMPTY)) {
      return;
    }
    keys.add(new Key(k));
    values.add(new Value(v));
  }
  
  private void bufferNextRow() throws IOException {
    
    keys.clear();
    values.clear();
    currentPosition = 0;
    
    while (source.hasTop() && keys.size() == 0) {
      
      addKeyValue(source.getTopKey(), source.getTopValue());
      
      if (isSuppressionMarker(source.getTopKey(), source.getTopValue())) {
        
        consumeRow(source.getTopKey().getRowData());
        
      } else {
        
        ByteSequence currentRow = keys.get(0).getRowData();
        source.next();
        
        while (source.hasTop() && source.getTopKey().getRowData().equals(currentRow)) {
          
          addKeyValue(source.getTopKey(), source.getTopValue());
          
          if (keys.size() > maxColumns) {
            keys.clear();
            values.clear();
            
            // when the row is to big, just emit a suppression
            // marker
            addKeyValue(new Key(new Text(currentRow.toArray())), SUPPRESS_ROW_VALUE);
            consumeRow(currentRow);
          } else {
            source.next();
          }
        }
      }
      
    }
  }
  
  private void readNextRow() throws IOException {
    
    bufferNextRow();
    
    while (!propogateSuppression && currentPosition < keys.size() && isSuppressionMarker(keys.get(0), values.get(0))) {
      bufferNextRow();
    }
  }
  
  private LargeRowFilter(SortedKeyValueIterator<Key,Value> source, boolean propogateSuppression, int maxColumns) {
    this.source = source;
    this.propogateSuppression = propogateSuppression;
    this.maxColumns = maxColumns;
  }
  
  public LargeRowFilter() {}
  
  @Override
  public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
    this.source = source;
    this.maxColumns = Integer.parseInt(options.get(MAX_COLUMNS));
    this.propogateSuppression = env.getIteratorScope() != IteratorScope.scan;
  }
  
  @Override
  public boolean hasTop() {
    return currentPosition < keys.size();
  }
  
  @Override
  public void next() throws IOException {
    
    if (currentPosition >= keys.size()) {
      throw new IllegalStateException("Called next() when hasTop() is false");
    }
    
    currentPosition++;
    
    if (currentPosition == keys.size()) {
      readNextRow();
    }
  }
  
  @Override
  public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
    
    if (inclusive && !columnFamilies.contains(EMPTY)) {
      columnFamilies = new HashSet<ByteSequence>(columnFamilies);
      columnFamilies.add(EMPTY);
      dropEmptyColFams = true;
    } else if (!inclusive && columnFamilies.contains(EMPTY)) {
      columnFamilies = new HashSet<ByteSequence>(columnFamilies);
      columnFamilies.remove(EMPTY);
      dropEmptyColFams = true;
    } else {
      dropEmptyColFams = false;
    }
    
    this.range = range;
    this.columnFamilies = columnFamilies;
    this.inclusive = inclusive;
    
    if (range.getStartKey() != null) {
      // seek to beginning of row to see if there is a suppression marker
      Range newRange = new Range(new Key(range.getStartKey().getRow()), true, range.getEndKey(), range.isEndKeyInclusive());
      source.seek(newRange, columnFamilies, inclusive);
      
      readNextRow();
      
      // it is possible that all or some of the data read for the current
      // row is before the start of the range
      while (currentPosition < keys.size() && range.beforeStartKey(keys.get(currentPosition)))
        currentPosition++;
      
      if (currentPosition == keys.size())
        readNextRow();
      
    } else {
      source.seek(range, columnFamilies, inclusive);
      readNextRow();
    }
    
  }
  
  @Override
  public Key getTopKey() {
    return keys.get(currentPosition);
  }
  
  @Override
  public Value getTopValue() {
    return values.get(currentPosition);
  }
  
  @Override
  public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) {
    return new LargeRowFilter(source.deepCopy(env), propogateSuppression, maxColumns);
  }
  
  @Override
  public IteratorOptions describeOptions() {
    String description = "This iterator suppresses rows that exceed a specified number of columns. Once\n"
        + "a row exceeds the threshold, a marker is emitted and the row is always\n" + "suppressed by this iterator after that point in time.\n"
        + " This iterator works in a similar way to the RowDeletingIterator. See its\n" + " javadoc about locality groups.\n";
    return new IteratorOptions(this.getClass().getSimpleName(), description, Collections.singletonMap(MAX_COLUMNS, "Number Of Columns To Begin Suppression"),
        null);
  }
  
  @Override
  public boolean validateOptions(Map<String,String> options) {
    if (options == null || options.size() < 1) {
      System.out.println("Bad # of options, must supply: " + MAX_COLUMNS + " as value");
      return false;
    }
    
    if (options.containsKey(MAX_COLUMNS)) {
      try {
        maxColumns = Integer.parseInt(options.get(MAX_COLUMNS));
      } catch (NumberFormatException e) {
        e.printStackTrace();
        return false;
      }
    } else {
      System.out.println("Need to have " + MAX_COLUMNS);
      return false;
    }
    
    return true;
  }
  
}
Source Code of org.apache.accumulo.core.iterators.LargeRowFilter

Related Classes of org.apache.accumulo.core.iterators.LargeRowFilter