/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db;
import java.io.File;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import javax.management.MBeanServer;
import javax.management.ObjectName;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.log4j.Logger;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.io.DataInputBuffer;
import org.apache.cassandra.io.DataOutputBuffer;
import org.apache.cassandra.io.IndexHelper;
import org.apache.cassandra.io.SSTable;
import org.apache.cassandra.io.SequenceFile;
import org.apache.cassandra.net.EndPoint;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.service.IPartitioner;
import org.apache.cassandra.utils.BloomFilter;
import org.apache.cassandra.utils.FileUtils;
import org.apache.cassandra.utils.LogUtil;
/**
* Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com )
*/
public class ColumnFamilyStore implements ColumnFamilyStoreMBean
{
private static int threshHold_ = 4;
private static final int bufSize_ = 128*1024*1024;
private static int compactionMemoryThreshold_ = 1 << 30;
private static Logger logger_ = Logger.getLogger(ColumnFamilyStore.class);
private final String table_;
public final String columnFamily_;
private final boolean isSuper_;
private volatile Integer memtableSwitchCount = 0;
/* This is used to generate the next index for an SSTable */
private AtomicInteger fileIndexGenerator_ = new AtomicInteger(0);
/* memtable associated with this ColumnFamilyStore. */
private AtomicReference<Memtable> memtable_;
private AtomicReference<BinaryMemtable> binaryMemtable_;
/* SSTables on disk for this column family */
private Set<String> ssTables_ = new HashSet<String>();
/* Modification lock used for protecting reads from compactions. */
private ReentrantReadWriteLock lock_ = new ReentrantReadWriteLock(true);
/* Flag indicates if a compaction is in process */
private AtomicBoolean isCompacting_ = new AtomicBoolean(false);
ColumnFamilyStore(String table, String columnFamily, boolean isSuper, int indexValue) throws IOException
{
table_ = table;
columnFamily_ = columnFamily;
isSuper_ = isSuper;
fileIndexGenerator_.set(indexValue);
memtable_ = new AtomicReference<Memtable>(new Memtable(table_, columnFamily_));
binaryMemtable_ = new AtomicReference<BinaryMemtable>(new BinaryMemtable(table_, columnFamily_));
}
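/**
* Create the ColumnFamilyStore for the given table and column family,
* seeding the file index generator from the highest index found among the
* data files on disk, and register it as an MBean.
*/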
public static ColumnFamilyStore getColumnFamilyStore(String table, String columnFamily) throws IOException
{
/*
* Get all data files associated with old Memtables for this table.
* These files are named <Table>-<ColumnFamily>-<index>-Data.db. Find the
* maximum index, which is used to seed the generator for the next index.
*/
List<Integer> indices = new ArrayList<Integer>();
String[] dataFileDirectories = DatabaseDescriptor.getAllDataFileLocations();
for ( String directory : dataFileDirectories )
{
File fileDir = new File(directory);
File[] files = fileDir.listFiles();
for (File file : files)
{
String filename = file.getName();
String[] tblCfName = getTableAndColumnFamilyName(filename);
if (tblCfName[0].equals(table)
&& tblCfName[1].equals(columnFamily))
{
int index = getIndexFromFileName(filename);
indices.add(index);
}
}
}
Collections.sort(indices);
int value = (indices.size() > 0) ? (indices.get(indices.size() - 1)) : 0;
ColumnFamilyStore cfs = new ColumnFamilyStore(table, columnFamily, "Super".equals(DatabaseDescriptor.getColumnType(columnFamily)), value);
MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
try
{
mbs.registerMBean(cfs, new ObjectName(
"org.apache.cassandra.db:type=ColumnFamilyStore-" + table + "." + columnFamily));
}
catch (Exception e)
{
throw new RuntimeException(e);
}
return cfs;
}
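/**
* Called at startup: scans every data file location, deletes zero-length
* and leftover temporary files for this column family, registers the
* remaining data files, loads their indexes and Bloom filters, and submits
* the initial compaction check (plus hinted handoff delivery for the hints
* column family).
*/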
void onStart() throws IOException
{
/* Clean up stale files and collect the data files for this column family. */
List<File> ssTables = new ArrayList<File>();
String[] dataFileDirectories = DatabaseDescriptor.getAllDataFileLocations();
for ( String directory : dataFileDirectories )
{
File fileDir = new File(directory);
File[] files = fileDir.listFiles();
for (File file : files)
{
String filename = file.getName();
if(((file.length() == 0) || (filename.contains("-" + SSTable.temporaryFile_)) ) && (filename.contains(columnFamily_)))
{
file.delete();
continue;
}
String[] tblCfName = getTableAndColumnFamilyName(filename);
if (tblCfName[0].equals(table_)
&& tblCfName[1].equals(columnFamily_)
&& filename.contains("-Data.db"))
{
ssTables.add(file.getAbsoluteFile());
}
}
}
Collections.sort(ssTables, new FileUtils.FileComparator());
List<String> filenames = new ArrayList<String>();
for (File ssTable : ssTables)
{
filenames.add(ssTable.getAbsolutePath());
}
/* Register the data files found on disk; a compaction check is submitted below. */
ssTables_.addAll(filenames);
/* Load the index files and the Bloom Filters associated with them. */
SSTable.onStart(filenames);
logger_.debug("Submitting a major compaction task ...");
MinorCompactionManager.instance().submit(ColumnFamilyStore.this);
if(columnFamily_.equals(Table.hints_))
{
HintedHandOffManager.instance().submit(this);
}
// TODO this seems unnecessary -- each memtable flush checks to see if it needs to compact, too
MinorCompactionManager.instance().submitPeriodicCompaction(this);
}
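/** Return a copy of the names of the SSTable data files currently tracked on disk. */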
List<String> getAllSSTablesOnDisk()
{
return new ArrayList<String>(ssTables_);
}
/*
* This method is called to obtain statistics about the column family
* represented by this ColumnFamilyStore. It reports the total number of
* data files on disk and the total disk space occupied by them.
*/
public String cfStats(String newLineSeparator)
{
StringBuilder sb = new StringBuilder();
/*
* If there are no files on disk, return an empty string so that
* nothing ugly is displayed on the admin page.
*/
if ( ssTables_.size() == 0 )
{
return sb.toString();
}
sb.append(columnFamily_ + " statistics :");
sb.append(newLineSeparator);
sb.append("Number of files on disk : " + ssTables_.size());
sb.append(newLineSeparator);
double totalSpace = 0d;
for ( String file : ssTables_ )
{
File f = new File(file);
totalSpace += f.length();
}
String diskSpace = FileUtils.stringifyFileSize(totalSpace);
sb.append("Total disk space : " + diskSpace);
sb.append(newLineSeparator);
sb.append("--------------------------------------");
sb.append(newLineSeparator);
return sb.toString();
}
/*
* This is called after bootstrap to add the files
* to the list of files maintained.
*/
void addToList(String file)
{
lock_.writeLock().lock();
try
{
ssTables_.add(file);
}
finally
{
lock_.writeLock().unlock();
}
}
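/*
* Touch the given key in every SSTable that may contain it according to
* the file's Bloom filter; fData is passed through to SSTable.touch.
*/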
void touch(String key, boolean fData) throws IOException
{
/* Scan the SSTables on disk first */
lock_.readLock().lock();
try
{
List<String> files = new ArrayList<String>(ssTables_);
for (String file : files)
{
/*
* Get the BloomFilter associated with this file. Check if the key
* is present in the BloomFilter. If not continue to the next file.
*/
boolean bVal = SSTable.isKeyInFile(key, file);
if ( !bVal )
continue;
SSTable ssTable = new SSTable(file);
ssTable.touch(key, fData);
}
}
finally
{
lock_.readLock().unlock();
}
}
/*
* This method forces a compaction of the SSTables on disk. For a range
* compaction we wait for the task to complete by blocking on its future;
* a major compaction is submitted without waiting.
*/
boolean forceCompaction(List<Range> ranges, EndPoint target, long skip, List<String> fileList)
{
Future<Boolean> futurePtr = null;
if( ranges != null)
futurePtr = MinorCompactionManager.instance().submit(ColumnFamilyStore.this, ranges, target, fileList);
else
MinorCompactionManager.instance().submitMajor(ColumnFamilyStore.this, skip);
boolean result = true;
try
{
/* Waiting for the compaction to complete. */
if(futurePtr != null)
result = futurePtr.get();
logger_.debug("Done forcing compaction ...");
}
catch (ExecutionException ex)
{
logger_.debug(LogUtil.throwableToString(ex));
}
catch ( InterruptedException ex2 )
{
logger_.debug(LogUtil.throwableToString(ex2));
}
return result;
}
String getColumnFamilyName()
{
return columnFamily_;
}
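/*
* Extract the table and column family names from a data file name of the
* form <table>-<column family>-<index>-Data.db.
*/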
private static String[] getTableAndColumnFamilyName(String filename)
{
StringTokenizer st = new StringTokenizer(filename, "-");
String[] values = new String[2];
int i = 0;
while (st.hasMoreElements())
{
if (i == 0)
values[i] = (String) st.nextElement();
else if (i == 1)
{
values[i] = (String) st.nextElement();
break;
}
++i;
}
return values;
}
protected static int getIndexFromFileName(String filename)
{
/*
* File names are of the form <table>-<column family>-<index>-Data.db.
* Tokenizing on '-' puts the index in the second-to-last token.
*/
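// e.g. (hypothetical name) "Table1-Standard1-5-Data.db" tokenizes to
// [Table1, Standard1, 5, Data.db] and this method returns 5.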
StringTokenizer st = new StringTokenizer(filename, "-");
/*
* Walk the tokens and return the index portion of the filename,
* i.e. the token at position (count - 2).
*/
int count = st.countTokens();
int i = 0;
String index = null;
while (st.hasMoreElements())
{
index = (String) st.nextElement();
if (i == (count - 2))
break;
++i;
}
return Integer.parseInt(index);
}
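/* Generate the next SSTable file name prefix of the form <table>-<column family>-<index>. */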
String getNextFileName()
{
// Pseudo increment so that we do not generate consecutive numbers
fileIndexGenerator_.incrementAndGet();
return table_ + "-" + columnFamily_ + "-" + fileIndexGenerator_.incrementAndGet();
}
/*
* Return a temporary file name.
*/
String getTempFileName()
{
// Pseudo increment so that we do not generate consecutive numbers
fileIndexGenerator_.incrementAndGet();
return table_ + "-" + columnFamily_ + "-" + SSTable.temporaryFile_ + "-" + fileIndexGenerator_.incrementAndGet();
}
/*
* Return a temporary file name based on the list of files given.
* The list is sorted and an index between the two lowest file indices is
* generated (lowest + 1), which is unique because we never generate
* consecutive file indices, so the lowest index can simply be incremented.
*/
String getTempFileName( List<String> files)
{
int lowestIndex;
int index;
Collections.sort(files, new FileNameComparator(FileNameComparator.Ascending));
if( files.size() <= 1)
return null;
lowestIndex = getIndexFromFileName(files.get(0));
index = lowestIndex + 1 ;
return table_ + "-" + columnFamily_ + "-" + SSTable.temporaryFile_ + "-" + index;
}
/*
* This version is used only on start up when we are recovering from logs.
* In the future we may want to parallelize the log processing for a table
* by having a thread per log file present for recovery. Re-visit at that
* time.
*/
void switchMemtable(String key, ColumnFamily columnFamily, CommitLog.CommitLogContext cLogCtx) throws IOException
{
memtable_.set( new Memtable(table_, columnFamily_) );
if(!key.equals(Memtable.flushKey_))
memtable_.get().put(key, columnFamily, cLogCtx);
if (memtableSwitchCount == Integer.MAX_VALUE)
{
memtableSwitchCount = 0;
}
memtableSwitchCount++;
}
/*
* This version is used only on start up when we are recovering from logs.
* In the future we may want to parallelize the log processing for a table
* by having a thread per log file present for recovery. Re-visit at that
* time.
*/
void switchBinaryMemtable(String key, byte[] buffer) throws IOException
{
binaryMemtable_.set( new BinaryMemtable(table_, columnFamily_) );
binaryMemtable_.get().put(key, buffer);
}
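/*
* Ask the current memtable to flush itself to disk; the flush itself is
* performed asynchronously (see forceBlockingFlush to wait for it).
*/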
void forceFlush() throws IOException
{
memtable_.get().forceflush(this);
}
void forceBlockingFlush() throws IOException, ExecutionException, InterruptedException
{
forceFlush();
// block for flush to finish by adding a no-op action to the flush executorservice
// and waiting for that to finish. (this works since flush ES is single-threaded.)
Future f = MemtableManager.instance().flusher_.submit(new Runnable()
{
public void run()
{
}
});
f.get();
}
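/* Submit the current binary memtable to the BinaryMemtableManager for flushing. */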
void forceFlushBinary()
{
BinaryMemtableManager.instance().submit(getColumnFamilyName(), binaryMemtable_.get());
//binaryMemtable_.get().flush(true);
}
/**
* Insert/Update the column family for this key.
* @param key - key for update/insert
* @param columnFamily - columnFamily changes
* @param cLogCtx - commit log context recorded for this write
*/
void apply(String key, ColumnFamily columnFamily, CommitLog.CommitLogContext cLogCtx)
throws IOException
{
memtable_.get().put(key, columnFamily, cLogCtx);
}
/*
* Insert/Update the pre-serialized column family data for this key.
* @param key - key for update/insert
* @param buffer - serialized columnFamily changes
*/
void applyBinary(String key, byte[] buffer)
throws IOException
{
binaryMemtable_.get().put(key, buffer);
}
public ColumnFamily getColumnFamily(String key, String columnFamilyColumn, IFilter filter) throws IOException
{
List<ColumnFamily> columnFamilies = getColumnFamilies(key, columnFamilyColumn, filter);
return resolveAndRemoveDeleted(columnFamilies);
}
/**
* Get the column family in the most efficient order:
* 1. the current Memtable
* 2. historical Memtables held by the MemtableManager
* 3. the sorted list of files on disk
*/
List<ColumnFamily> getColumnFamilies(String key, String columnFamilyColumn, IFilter filter) throws IOException
{
List<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>();
/* Get the ColumnFamily from Memtable */
getColumnFamilyFromCurrentMemtable(key, columnFamilyColumn, filter, columnFamilies);
if (columnFamilies.size() == 0 || !filter.isDone())
{
/* Check if MemtableManager has any historical information */
MemtableManager.instance().getColumnFamily(key, columnFamily_, columnFamilyColumn, filter, columnFamilies);
}
if (columnFamilies.size() == 0 || !filter.isDone())
{
long start = System.currentTimeMillis();
getColumnFamilyFromDisk(key, columnFamilyColumn, columnFamilies, filter);
logger_.debug("DISK TIME: " + (System.currentTimeMillis() - start) + " ms.");
}
return columnFamilies;
}
/**
* Fetch from the data files on disk, visiting them in sorted order for
* efficiency. This method exits as soon as the required data is found.
* @param key
* @param cf
* @param columnFamilies
* @param filter
* @throws IOException
*/
private void getColumnFamilyFromDisk(String key, String cf, List<ColumnFamily> columnFamilies, IFilter filter) throws IOException
{
/* Scan the SSTables on disk first */
List<String> files = new ArrayList<String>();
lock_.readLock().lock();
try
{
files.addAll(ssTables_);
Collections.sort(files, new FileNameComparator(FileNameComparator.Descending));
}
finally
{
lock_.readLock().unlock();
}
for (String file : files)
{
/*
* Get the BloomFilter associated with this file. Check if the key
* is present in the BloomFilter. If not continue to the next file.
*/
boolean bVal = SSTable.isKeyInFile(key, file);
if ( !bVal )
continue;
ColumnFamily columnFamily = fetchColumnFamily(key, cf, filter, file);
long start = System.currentTimeMillis();
if (columnFamily != null)
{
columnFamilies.add(columnFamily);
if(filter.isDone())
{
break;
}
}
logger_.debug("DISK Data structure population TIME: " + (System.currentTimeMillis() - start) + " ms.");
}
}
private ColumnFamily fetchColumnFamily(String key, String cf, IFilter filter, String ssTableFile) throws IOException
{
SSTable ssTable = new SSTable(ssTableFile);
long start = System.currentTimeMillis();
DataInputBuffer bufIn;
bufIn = filter.next(key, cf, ssTable);
logger_.debug("DISK ssTable.next TIME: " + (System.currentTimeMillis() - start) + " ms.");
if (bufIn.getLength() == 0)
return null;
start = System.currentTimeMillis();
ColumnFamily columnFamily = ColumnFamily.serializer().deserialize(bufIn, cf, filter);
logger_.debug("DISK Deserialize TIME: " + (System.currentTimeMillis() - start) + " ms.");
if (columnFamily == null)
return null;
return columnFamily;
}
private void getColumnFamilyFromCurrentMemtable(String key, String cf, IFilter filter, List<ColumnFamily> columnFamilies)
{
/* Get the ColumnFamily from Memtable */
ColumnFamily columnFamily = memtable_.get().get(key, cf, filter);
if (columnFamily != null)
{
columnFamilies.add(columnFamily);
}
}
/** merge all columnFamilies into a single instance, with only the newest versions of columns preserved. */
static ColumnFamily resolve(List<ColumnFamily> columnFamilies)
{
int size = columnFamilies.size();
if (size == 0)
return null;
// start from nothing so that we don't include potential deleted columns from the first instance
ColumnFamily cf0 = columnFamilies.get(0);
ColumnFamily cf = cf0.cloneMeShallow();
// merge
for (ColumnFamily cf2 : columnFamilies)
{
assert cf.name().equals(cf2.name());
cf.addColumns(cf2);
cf.delete(Math.max(cf.getLocalDeletionTime(), cf2.getLocalDeletionTime()),
Math.max(cf.getMarkedForDeleteAt(), cf2.getMarkedForDeleteAt()));
}
return cf;
}
/** like resolve, but leaves the resolved CF as the only item in the list */
private static void merge(List<ColumnFamily> columnFamilies)
{
ColumnFamily cf = resolve(columnFamilies);
columnFamilies.clear();
columnFamilies.add(cf);
}
private static ColumnFamily resolveAndRemoveDeleted(List<ColumnFamily> columnFamilies) {
ColumnFamily cf = resolve(columnFamilies);
return removeDeleted(cf);
}
/*
This is complicated because we need to preserve deleted columns, supercolumns, and columnfamilies
until they have been deleted for at least GC_GRACE_IN_SECONDS. But, we do not need to preserve
their contents; just the object itself as a "tombstone" that can be used to repair other
replicas that do not know about the deletion.
*/
static ColumnFamily removeDeleted(ColumnFamily cf)
{
return removeDeleted(cf, (int)(System.currentTimeMillis() / 1000) - DatabaseDescriptor.getGcGraceInSeconds());
}
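/*
* Remove deleted columns and purge tombstones whose local deletion time is
* at or before gcBefore (seconds since the epoch); newer tombstones are
* retained so they can repair other replicas.
*/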
static ColumnFamily removeDeleted(ColumnFamily cf, int gcBefore)
{
if (cf == null)
return null;
for (String cname : new ArrayList<String>(cf.getColumns().keySet()))
{
IColumn c = cf.getColumns().get(cname);
if (c instanceof SuperColumn)
{
long minTimestamp = Math.max(c.getMarkedForDeleteAt(), cf.getMarkedForDeleteAt());
// don't operate directly on the supercolumn, it could be the one in the memtable
cf.remove(cname);
SuperColumn sc = new SuperColumn(cname);
sc.markForDeleteAt(c.getLocalDeletionTime(), c.getMarkedForDeleteAt());
for (IColumn subColumn : c.getSubColumns())
{
if (subColumn.timestamp() >= minTimestamp)
{
if (!subColumn.isMarkedForDelete() || subColumn.getLocalDeletionTime() > gcBefore)
{
sc.addColumn(subColumn.name(), subColumn);
}
}
}
if (sc.getSubColumns().size() > 0 || sc.getLocalDeletionTime() > gcBefore)
{
cf.addColumn(sc);
}
}
else if ((c.isMarkedForDelete() && c.getLocalDeletionTime() <= gcBefore)
|| c.timestamp() < cf.getMarkedForDeleteAt())
{
cf.remove(cname);
}
}
if (cf.getColumnCount() == 0 && cf.getLocalDeletionTime() <= gcBefore)
{
return null;
}
return cf;
}
/*
* This version is used only on start up when we are recovering from logs.
* Hence no locking is required since we process logs on the main thread. In
* the future we may want to parallelize the log processing for a table by
* having a thread per log file present for recovery. Re-visit at that time.
*/
void applyNow(String key, ColumnFamily columnFamily) throws IOException
{
memtable_.get().putOnRecovery(key, columnFamily);
}
/*
* This method is called when the Memtable is frozen and ready to be flushed
* to disk. This method informs the CommitLog that a particular ColumnFamily
* is being flushed to disk.
*/
void onMemtableFlush(CommitLog.CommitLogContext cLogCtx) throws IOException
{
if ( cLogCtx.isValidContext() )
CommitLog.open(table_).onMemtableFlush(columnFamily_, cLogCtx);
}
/*
* Called after the Memtable flushes its in-memory data. This information is
* cached in the ColumnFamilyStore. This is useful for reads because the
* ColumnFamilyStore first looks in the in-memory store and then into the
* disk to find the key. If invoked during recoveryMode the
* onMemtableFlush() need not be invoked.
*
* param @ filename - filename just flushed to disk
* param @ bf - bloom filter which indicates the keys that are in this file.
*/
void storeLocation(String filename, BloomFilter bf)
{
int ssTableSize = 0;
lock_.writeLock().lock();
try
{
ssTables_.add(filename);
SSTable.storeBloomFilter(filename, bf);
ssTableSize = ssTables_.size();
}
finally
{
lock_.writeLock().unlock();
}
if ((ssTableSize >= threshHold_ && !isCompacting_.get())
|| (isCompacting_.get() && ssTableSize % threshHold_ == 0))
{
logger_.debug("Submitting for compaction ...");
MinorCompactionManager.instance().submit(ColumnFamilyStore.this);
logger_.debug("Submitted for compaction ...");
}
}
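/*
* Build a priority queue containing one FileStruct per input file, each
* advanced to its first key. Unreadable files are logged and skipped. The
* per-file buffer size is the smaller of minBufferSize and
* compactionMemoryThreshold_ divided by the number of files.
*/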
PriorityQueue<FileStruct> initializePriorityQueue(List<String> files, List<Range> ranges, int minBufferSize)
{
PriorityQueue<FileStruct> pq = new PriorityQueue<FileStruct>();
if (files.size() > 1 || (ranges != null && files.size() > 0))
{
int bufferSize = Math.min( (ColumnFamilyStore.compactionMemoryThreshold_ / files.size()), minBufferSize ) ;
FileStruct fs = null;
for (String file : files)
{
try
{
fs = new FileStruct(SequenceFile.bufferedReader(file, bufferSize), StorageService.getPartitioner());
fs.advance();
if(fs.isExhausted())
continue;
pq.add(fs);
}
catch ( Exception ex)
{
logger_.warn("corrupt file? or are you just blowing away data files manually out from under me?", ex);
try
{
if (fs != null)
{
fs.close();
}
}
catch(Exception e)
{
logger_.warn("Unable to close file :" + file);
}
}
}
}
return pq;
}
/*
* Group files of similar size into buckets.
*/
static Set<List<String>> getCompactionBuckets(List<String> files, long min)
{
Map<List<String>, Long> buckets = new ConcurrentHashMap<List<String>, Long>();
for(String fname : files)
{
File f = new File(fname);
long size = f.length();
boolean bFound = false;
// look for a bucket containing similar-sized files:
// group in the same bucket if it's w/in 50% of the average for this bucket,
// or this file and the bucket are all considered "small" (less than `min`)
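// Example (with min = 50MB): 20MB and 30MB files share a bucket because both
// are below min; 100MB and 110MB files share a bucket because 110MB is within
// 50% of the 100MB running average.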
for (List<String> bucket : buckets.keySet())
{
long averageSize = buckets.get(bucket);
if ((size > averageSize/2 && size < 3*averageSize/2)
|| ( size < min && averageSize < min))
{
// remove and re-add because adding changes the hash
buckets.remove(bucket);
averageSize = (averageSize + size) / 2 ;
bucket.add(fname);
buckets.put(bucket, averageSize);
bFound = true;
break;
}
}
// no similar bucket found; put it in a new one
if(!bFound)
{
ArrayList<String> bucket = new ArrayList<String>();
bucket.add(fname);
buckets.put(bucket, size);
}
}
return buckets.keySet();
}
/*
* Break the files into buckets and then compact.
*/
void doCompaction() throws IOException
{
isCompacting_.set(true);
List<String> files = new ArrayList<String>(ssTables_);
try
{
int count;
for(List<String> fileList : getCompactionBuckets(files, 50L*1024L*1024L))
{
Collections.sort( fileList , new FileNameComparator( FileNameComparator.Ascending));
if(fileList.size() >= threshHold_ )
{
files.clear();
count = 0;
for(String file : fileList)
{
files.add(file);
count++;
if( count == threshHold_ )
break;
}
// For each bucket that has crossed the threshold, do the compaction.
// In the case of a range compaction, merge the counting bloom filters as well.
if( count == threshHold_)
doFileCompaction(files, bufSize_);
}
}
}
finally
{
isCompacting_.set(false);
}
}
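/* Compact all SSTables for this column family; see doMajorCompactionInternal for the meaning of skip. */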
void doMajorCompaction(long skip)
{
doMajorCompactionInternal( skip );
}
/*
* Compact all the files irrespective of their size.
* skip : the threshold, in GB, above which files are skipped;
* all files larger than skip GB are excluded from this compaction,
* except when skip is 0, in which case all files are included.
*/
void doMajorCompactionInternal(long skip)
{
isCompacting_.set(true);
List<String> filesInternal = new ArrayList<String>(ssTables_);
List<String> files;
try
{
if( skip > 0L )
{
files = new ArrayList<String>();
for ( String file : filesInternal )
{
File f = new File(file);
if( f.length() < skip*1024L*1024L*1024L )
{
files.add(file);
}
}
}
else
{
files = filesInternal;
}
doFileCompaction(files, bufSize_);
}
catch ( Exception ex)
{
ex.printStackTrace();
}
finally
{
isCompacting_.set(false);
}
}
/*
* Add up all the file sizes; this is the worst-case size of the file
* produced by compacting the given list of files.
*/
long getExpectedCompactedFileSize(List<String> files)
{
long expectedFileSize = 0;
for(String file : files)
{
File f = new File(file);
long size = f.length();
expectedFileSize = expectedFileSize + size;
}
return expectedFileSize;
}
/*
* Find the largest file in the list.
*/
String getMaxSizeFile( List<String> files )
{
long maxSize = 0L;
String maxFile = null;
for ( String file : files )
{
File f = new File(file);
if(f.length() > maxSize )
{
maxSize = f.length();
maxFile = file;
}
}
return maxFile;
}
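/*
* Anti-compact every SSTable of this column family against the given ranges;
* the names of the newly written files are returned through fileList.
*/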
boolean doAntiCompaction(List<Range> ranges, EndPoint target, List<String> fileList)
{
isCompacting_.set(true);
List<String> files = new ArrayList<String>(ssTables_);
boolean result = true;
try
{
result = doFileAntiCompaction(files, ranges, target, fileList, null);
}
catch ( Exception ex)
{
ex.printStackTrace();
}
finally
{
isCompacting_.set(false);
}
return result;
}
void forceCleanup()
{
MinorCompactionManager.instance().submitCleanup(ColumnFamilyStore.this);
}
/**
* This function goes over each file and removes the keys that this node is
* not responsible for, keeping only the keys it is responsible for.
* @throws IOException
*/
void doCleanupCompaction()
{
isCompacting_.set(true);
List<String> files = new ArrayList<String>(ssTables_);
for(String file: files)
{
try
{
doCleanup(file);
}
catch ( Exception ex)
{
ex.printStackTrace();
}
}
isCompacting_.set(false);
}
/**
* cleans up one particular file by removing keys that this node is not responsible for.
* @param file
* @throws IOException
*/
/* TODO: Take care of the comments later. */
void doCleanup(String file)
{
if(file == null )
return;
List<Range> myRanges;
List<String> files = new ArrayList<String>();
files.add(file);
List<String> newFiles = new ArrayList<String>();
Map<EndPoint, List<Range>> endPointtoRangeMap = StorageService.instance().constructEndPointToRangesMap();
myRanges = endPointtoRangeMap.get(StorageService.getLocalStorageEndPoint());
List<BloomFilter> compactedBloomFilters = new ArrayList<BloomFilter>();
doFileAntiCompaction(files, myRanges, null, newFiles, compactedBloomFilters);
logger_.debug("Original file : " + file + " of size " + new File(file).length());
lock_.writeLock().lock();
try
{
ssTables_.remove(file);
SSTable.removeAssociatedBloomFilter(file);
for (String newfile : newFiles)
{
logger_.debug("New file : " + newfile + " of size " + new File(newfile).length());
if ( newfile != null )
{
ssTables_.add(newfile);
logger_.debug("Inserting bloom filter for file " + newfile);
SSTable.storeBloomFilter(newfile, compactedBloomFilters.get(0));
}
}
SSTable.delete(file);
}
finally
{
lock_.writeLock().unlock();
}
}
/**
* This function performs the anti-compaction process: it writes out a file containing
* only the keys that belong to the given ranges. If no target is specified, it writes
* the result as a compacted file with the unnecessary ranges wiped out.
* @param files
* @param ranges
* @param target
* @param fileList
* @return
* @throws IOException
*/
boolean doFileAntiCompaction(List<String> files, List<Range> ranges, EndPoint target, List<String> fileList, List<BloomFilter> compactedBloomFilters)
{
boolean result = false;
long startTime = System.currentTimeMillis();
long totalBytesRead = 0;
long totalBytesWritten = 0;
long totalkeysRead = 0;
long totalkeysWritten = 0;
String rangeFileLocation;
String mergedFileName;
IPartitioner p = StorageService.getPartitioner();
try
{
// Calculate the expected compacted filesize
long expectedRangeFileSize = getExpectedCompactedFileSize(files);
/* in the worst case a node will be giving out half of its data, so we take a chance */
expectedRangeFileSize = expectedRangeFileSize / 2;
rangeFileLocation = DatabaseDescriptor.getCompactionFileLocation(expectedRangeFileSize);
// boolean isLoop = isLoopAround( ranges );
// Range maxRange = getMaxRange( ranges );
// If the compaction file path is null that means we have no space left for this compaction.
if( rangeFileLocation == null )
{
logger_.warn("Total bytes to be written for range compaction ..."
+ expectedRangeFileSize + " is greater than the safe limit of the disk space available.");
return result;
}
PriorityQueue<FileStruct> pq = initializePriorityQueue(files, ranges, ColumnFamilyStore.bufSize_);
if (pq.size() > 0)
{
mergedFileName = getTempFileName();
SSTable ssTableRange = null ;
String lastkey = null;
List<FileStruct> lfs = new ArrayList<FileStruct>();
DataOutputBuffer bufOut = new DataOutputBuffer();
int expectedBloomFilterSize = SSTable.getApproximateKeyCount(files);
expectedBloomFilterSize = (expectedBloomFilterSize > 0) ? expectedBloomFilterSize : SSTable.indexInterval();
logger_.debug("Expected bloom filter size : " + expectedBloomFilterSize);
/* Create the bloom filter for the compacted file. */
BloomFilter compactedRangeBloomFilter = new BloomFilter(expectedBloomFilterSize, 15);
List<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>();
while (pq.size() > 0 || lfs.size() > 0)
{
FileStruct fs = null;
if (pq.size() > 0)
{
fs = pq.poll();
}
if (fs != null
&& (lastkey == null || lastkey.equals(fs.getKey())))
{
// The keys are the same so we need to add this to the
// lfs list
lastkey = fs.getKey();
lfs.add(fs);
}
else
{
Collections.sort(lfs, new FileStructComparator());
ColumnFamily columnFamily;
bufOut.reset();
if(lfs.size() > 1)
{
for (FileStruct filestruct : lfs)
{
try
{
/* read the length although we don't need it */
filestruct.getBufIn().readInt();
// Skip the Index
IndexHelper.skipBloomFilterAndIndex(filestruct.getBufIn());
// Only keep 2 column families in memory at a time, resolving them as we go, to save on memory footprint
if(columnFamilies.size() > 1)
{
// Now merge the 2 column families
merge(columnFamilies);
}
// deserialize into column families
columnFamilies.add(ColumnFamily.serializer().deserialize(filestruct.getBufIn()));
}
catch ( Exception ex)
{
logger_.warn(LogUtil.throwableToString(ex));
}
}
// After merging the column families for this key, append the result to the sstable
columnFamily = resolveAndRemoveDeleted(columnFamilies);
columnFamilies.clear();
if( columnFamily != null )
{
/* serialize the cf with column indexes */
ColumnFamily.serializerWithIndexes().serialize(columnFamily, bufOut);
}
}
else
{
FileStruct filestruct = lfs.get(0);
try
{
/* read the length although we don't need it */
int size = filestruct.getBufIn().readInt();
bufOut.write(filestruct.getBufIn(), size);
}
catch ( Exception ex)
{
logger_.warn(LogUtil.throwableToString(ex));
filestruct.close();
continue;
}
}
if ( Range.isKeyInRanges(ranges, p.undecorateKey(lastkey)) )
{
if(ssTableRange == null )
{
if( target != null )
rangeFileLocation = rangeFileLocation + System.getProperty("file.separator") + "bootstrap";
FileUtils.createDirectory(rangeFileLocation);
ssTableRange = new SSTable(rangeFileLocation, mergedFileName);
}
try
{
ssTableRange.append(lastkey, bufOut);
compactedRangeBloomFilter.add(lastkey);
}
catch(Exception ex)
{
logger_.warn( LogUtil.throwableToString(ex) );
}
}
totalkeysWritten++;
for (FileStruct filestruct : lfs)
{
try
{
filestruct.advance();
if (filestruct.isExhausted())
{
continue;
}
/* keep on looping until we find a key in the range */
while ( !Range.isKeyInRanges(ranges, p.undecorateKey(filestruct.getKey())) )
{
filestruct.advance();
if (filestruct.isExhausted())
{
break;
}
/* check if we need to continue; if we are done with the ranges, empty the queue, close all file handles and exit */
//if( !isLoop && StorageService.hash(filestruct.key).compareTo(maxRange.right()) > 0 && !filestruct.key.equals(""))
//{
//filestruct.reader.close();
//filestruct = null;
//break;
//}
}
if (!filestruct.isExhausted())
{
pq.add(filestruct);
}
totalkeysRead++;
}
catch ( Exception ex )
{
// Ignore the exception as it might be a corrupted file
// in any case we have read as far as possible from it
// and it will be deleted after compaction.
logger_.warn(LogUtil.throwableToString(ex));
filestruct.close();
}
}
lfs.clear();
lastkey = null;
if (fs != null)
{
// Add back the fs since we processed the rest of
// filestructs
pq.add(fs);
}
}
}
if( ssTableRange != null )
{
if ( fileList == null )
fileList = new ArrayList<String>();
ssTableRange.closeRename(compactedRangeBloomFilter, fileList);
if(compactedBloomFilters != null)
compactedBloomFilters.add(compactedRangeBloomFilter);
}
}
}
catch ( Exception ex)
{
logger_.warn( LogUtil.throwableToString(ex) );
}
logger_.debug("Total time taken for range split ..."
+ (System.currentTimeMillis() - startTime));
logger_.debug("Total bytes Read for range split ..." + totalBytesRead);
logger_.debug("Total bytes written for range split ..."
+ totalBytesWritten + " Total keys read ..." + totalkeysRead);
return result;
}
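/* Add the undecorated form of the key to the bloom filter for the compacted file. */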
private void doFill(BloomFilter bf, String decoratedKey)
{
bf.add(StorageService.getPartitioner().undecorateKey(decoratedKey));
}
/*
* This function does the actual compaction of the given files.
* It maintains a priority queue holding the current key from each file,
* repeatedly removes the smallest key from the queue, appends it to the
* SSTable, and reads the next key from that file, until all the files are
* exhausted. The SSTable to which the keys are written represents the new
* compacted file. Before writing, keys that occur in multiple files are
* resolved so that only the latest data is kept.
*/
void doFileCompaction(List<String> files, int minBufferSize) throws IOException
{
String newfile = null;
long startTime = System.currentTimeMillis();
long totalBytesRead = 0;
long totalBytesWritten = 0;
long totalkeysRead = 0;
long totalkeysWritten = 0;
// Calculate the expected compacted filesize
long expectedCompactedFileSize = getExpectedCompactedFileSize(files);
String compactionFileLocation = DatabaseDescriptor.getCompactionFileLocation(expectedCompactedFileSize);
// If the compaction file path is null that means we have no space left for this compaction.
if( compactionFileLocation == null )
{
String maxFile = getMaxSizeFile( files );
files.remove( maxFile );
doFileCompaction(files , minBufferSize);
return;
}
PriorityQueue<FileStruct> pq = initializePriorityQueue(files, null, minBufferSize);
if (pq.size() > 0)
{
String mergedFileName = getTempFileName( files );
SSTable ssTable = null;
String lastkey = null;
List<FileStruct> lfs = new ArrayList<FileStruct>();
DataOutputBuffer bufOut = new DataOutputBuffer();
int expectedBloomFilterSize = SSTable.getApproximateKeyCount(files);
expectedBloomFilterSize = (expectedBloomFilterSize > 0) ? expectedBloomFilterSize : SSTable.indexInterval();
logger_.debug("Expected bloom filter size : " + expectedBloomFilterSize);
/* Create the bloom filter for the compacted file. */
BloomFilter compactedBloomFilter = new BloomFilter(expectedBloomFilterSize, 15);
List<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>();
while (pq.size() > 0 || lfs.size() > 0)
{
FileStruct fs = null;
if (pq.size() > 0)
{
fs = pq.poll();
}
if (fs != null
&& (lastkey == null || lastkey.equals(fs.getKey())))
{
// The keys are the same so we need to add this to the
// lfs list
lastkey = fs.getKey();
lfs.add(fs);
}
else
{
Collections.sort(lfs, new FileStructComparator());
ColumnFamily columnFamily;
bufOut.reset();
if(lfs.size() > 1)
{
for (FileStruct filestruct : lfs)
{
try
{
/* read the length although we don't need it */
filestruct.getBufIn().readInt();
// Skip the Index
IndexHelper.skipBloomFilterAndIndex(filestruct.getBufIn());
// Only keep 2 column families in memory at a time, resolving them as we go, to save on memory footprint
if(columnFamilies.size() > 1)
{
merge(columnFamilies);
}
// deserialize into column families
columnFamilies.add(ColumnFamily.serializer().deserialize(filestruct.getBufIn()));
}
catch ( Exception ex)
{
logger_.warn("error in filecompaction", ex);
}
}
// After merging the column families for this key, append the result to the sstable
columnFamily = resolveAndRemoveDeleted(columnFamilies);
columnFamilies.clear();
if( columnFamily != null )
{
/* serialize the cf with column indexes */
ColumnFamily.serializerWithIndexes().serialize(columnFamily, bufOut);
}
}
else
{
FileStruct filestruct = lfs.get(0);
try
{
/* read the length although we don't need it */
int size = filestruct.getBufIn().readInt();
bufOut.write(filestruct.getBufIn(), size);
}
catch ( Exception ex)
{
ex.printStackTrace();
filestruct.close();
continue;
}
}
if ( ssTable == null )
{
ssTable = new SSTable(compactionFileLocation, mergedFileName);
}
ssTable.append(lastkey, bufOut);
/* Fill the bloom filter with the key */
doFill(compactedBloomFilter, lastkey);
totalkeysWritten++;
for (FileStruct filestruct : lfs)
{
try
{
filestruct.advance();
if (filestruct.isExhausted())
{
continue;
}
pq.add(filestruct);
totalkeysRead++;
}
catch ( Throwable ex )
{
// Ignore the exception as it might be a corrupted file
// in any case we have read as far as possible from it
// and it will be deleted after compaction.
filestruct.close();
}
}
lfs.clear();
lastkey = null;
if (fs != null)
{
/* Add back the fs since we processed the rest of filestructs */
pq.add(fs);
}
}
}
if ( ssTable != null )
{
ssTable.closeRename(compactedBloomFilter);
newfile = ssTable.getDataFileLocation();
}
lock_.writeLock().lock();
try
{
for (String file : files)
{
ssTables_.remove(file);
SSTable.removeAssociatedBloomFilter(file);
}
if ( newfile != null )
{
ssTables_.add(newfile);
logger_.debug("Inserting bloom filter for file " + newfile);
SSTable.storeBloomFilter(newfile, compactedBloomFilter);
totalBytesWritten = (new File(newfile)).length();
}
}
finally
{
lock_.writeLock().unlock();
}
for (String file : files)
{
SSTable.delete(file);
}
}
logger_.debug("Total time taken for compaction ..."
+ (System.currentTimeMillis() - startTime));
logger_.debug("Total bytes Read for compaction ..." + totalBytesRead);
logger_.debug("Total bytes written for compaction ..."
+ totalBytesWritten + " Total keys read ..." + totalkeysRead);
}
public boolean isSuper()
{
return isSuper_;
}
public void flushMemtableOnRecovery() throws IOException
{
memtable_.get().flushOnRecovery();
}
public int getMemtableColumnsCount()
{
return memtable_.get().getCurrentObjectCount();
}
public int getMemtableDataSize()
{
return memtable_.get().getCurrentSize();
}
public int getMemtableSwitchCount()
{
return memtableSwitchCount;
}
/**
* clears out all data associated with this ColumnFamily.
* For use in testing.
*/
public void reset() throws IOException, ExecutionException, InterruptedException
{
forceBlockingFlush();
for (String fName : ssTables_)
{
new File(fName).delete();
}
ssTables_.clear();
}
}