// Source listing: org.voltdb.SnapshotSiteProcessor (package org.voltdb)

/* This file is part of VoltDB.
* Copyright (C) 2008-2010 VoltDB Inc.
*
* VoltDB is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* VoltDB is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
*/

package org.voltdb;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.log4j.Logger;
import org.voltdb.jni.ExecutionEngine;
import org.voltdb.utils.DBBPool.BBContainer;

import edu.brown.hstore.HStore;

/**
* Encapsulates the state needed to manage an ongoing snapshot at the
* per-execution site level. Also contains some static global snapshot
* counters. This class requires callers to maintain thread safety;
* generally (exclusively?) it is driven by ExecutionSite, each of
* which has a SnapshotSiteProcessor.
*/
public class SnapshotSiteProcessor {
    private static final Logger LOG = Logger.getLogger(SnapshotSiteProcessor.class);

    /** Global count of execution sites on this node performing snapshot */
    public static final AtomicInteger ExecutionSitesCurrentlySnapshotting =
        new AtomicInteger(-1);

    /**
     * Ensure the first thread to run the fragment does the creation
     * of the targets and the distribution of the work.
     */
    public static final Semaphore m_snapshotCreateSetupPermit =
        new Semaphore(1);

    /**
     * Only proceed once permits are available after setup completes
     */
    public static Semaphore m_snapshotPermits = new Semaphore(0);

    /**
     * Global collection populated by snapshot creator, poll'd by individual sites
     */
    public static final LinkedList<Deque<SnapshotTableTask>> m_taskListsForSites =
        new LinkedList<Deque<SnapshotTableTask>>();


    /** Number of snapshot buffers to keep */
    static final int m_numSnapshotBuffers = 8;

    /**
     * Pick a buffer length that is big enough to store at least one of the largest size tuple supported
     * in the system (2 megabytes). Add a fudge factor for metadata.
     */
    public static final int m_snapshotBufferLength = (1024 * 1024 * 2) + Short.MAX_VALUE;
    private final ArrayList<BBContainer> m_snapshotBufferOrigins =
        new ArrayList<BBContainer>();
    /**
     * Set to true when the buffer is sent to a SnapshotDataTarget for I/O
     * and back to false when the container is discarded.
     * A volatile allows the EE to check for the buffer without
     * synchronization when the snapshot is done online.
     */
    private final ConcurrentLinkedQueue<BBContainer> m_availableSnapshotBuffers
        = new ConcurrentLinkedQueue<BBContainer>();

    /**
     * The last EE out has to shut off the lights. Cache a list
     * of targets in case this EE ends up being the one that needs
     * to close each target.
     */
    private ArrayList<SnapshotDataTarget> m_snapshotTargets;

    /**
     * Queue of tasks for tables that still need to be snapshotted.
     * This is polled from until there are no more tasks.
     */
    private ArrayDeque<SnapshotTableTask> m_snapshotTableTasks;


    /**
     * List of threads to join to block on snapshot completion
     * when using completeSnapshotWork().
     */
    private ArrayList<Thread> m_snapshotTargetTerminators = null;

    /**
     * When a buffer is returned to the pool this is invoked to ensure the EE wakes up
     * and does any potential snapshot work with that buffer
     */
    private final Runnable m_onPotentialSnapshotWork;
   
    /**
     * finish only after digest written
     */
    public static AtomicBoolean m_digestWritten = new AtomicBoolean(false);

    /**
     * only one partion does createSetup in SnapshotSaveAPI
     */
    public static AtomicBoolean m_finishedSetup = new AtomicBoolean(false);

   
    /**
     * A class identifying a table that should be snapshotted as well as the destination
     * for the resulting tuple blocks
     */
    public static class SnapshotTableTask {
        private final int m_tableId;
        private final SnapshotDataTarget m_target;
        private final boolean m_isReplicated;
        private final String m_name;

        public SnapshotTableTask(
                final int tableId,
                final SnapshotDataTarget target,
                boolean isReplicated,
                final String tableName) {
            m_tableId = tableId;
            m_target = target;
            m_isReplicated = isReplicated;
            m_name = tableName;
        }

        @Override
        public String toString() {
            return ("SnapshotTableTask for " + m_name );
        }
    }

    public SnapshotSiteProcessor(Runnable onPotentialSnapshotWork) {
        m_onPotentialSnapshotWork = onPotentialSnapshotWork;
        initializeBufferPool();
    }

    public void shutdown() {
        for (BBContainer c : m_snapshotBufferOrigins ) {
            c.discard();
        }
        m_snapshotBufferOrigins.clear();
        m_availableSnapshotBuffers.clear();
    }

    void initializeBufferPool() {
        for (int ii = 0; ii < SnapshotSiteProcessor.m_numSnapshotBuffers; ii++) {
            final BBContainer origin = org.voltdb.utils.DBBPool.allocateDirect(m_snapshotBufferLength);
            m_snapshotBufferOrigins.add(origin);
            long snapshotBufferAddress = 0;
            if (VoltDB.getLoadLibVOLTDB()) {
                snapshotBufferAddress = org.voltdb.utils.DBBPool.getBufferAddress(origin.b);
            }
            m_availableSnapshotBuffers.offer(new BBContainer(origin.b, snapshotBufferAddress) {
                @Override
                public void discard() {
                    m_availableSnapshotBuffers.offer(this);
                    m_onPotentialSnapshotWork.run();
                }
            });
        }
    }

    public void initiateSnapshots(ExecutionEngine ee, Deque<SnapshotTableTask> tasks) {
        LOG.trace("initiateSnapshots at : partition "+ee.getPartitionExecutor().getPartitionId()+ " tasks size ::"+tasks.size());
       
        m_snapshotTableTasks = new ArrayDeque<SnapshotTableTask>(tasks);
        m_snapshotTargets = new ArrayList<SnapshotDataTarget>();
        for (final SnapshotTableTask task : tasks) {
            if (!task.m_isReplicated) {
                assert(task != null);
                assert(m_snapshotTargets != null);
                m_snapshotTargets.add(task.m_target);
            }
            // FIXME meng
           if (!ee.activateTableStream(task.m_tableId, TableStreamType.SNAPSHOT )) {
               LOG.error("Attempted to activate copy on write mode for table "
                       + task.m_name + " and failed");
               LOG.error(task);
               HStore.crashDB();
           }
           else{
               LOG.trace("Activated COW mode for table "+task.m_name+" at partition "+ee.getPartitionExecutor().getPartitionId());
           }
        }
    }

    public Future<?> doSnapshotWork(ExecutionEngine ee) {
        Future<?> retval = null;

        /*
         * This thread will null out the reference to m_snapshotTableTasks when
         * a snapshot is finished. If the snapshot buffer is loaned out that means
         * it is pending I/O somewhere so there is no work to do until it comes back.
         */
        if (m_snapshotTableTasks == null || m_availableSnapshotBuffers.isEmpty()) {
            return retval;
        }
       
        int partition_id = ee.getPartitionExecutor().getPartitionId();
        LOG.trace("doSnapshotWork at : partition "+ partition_id);

        /*
         * There definitely is snapshot work to do. There should be a task
         * here. If there isn't something is wrong because when the last task
         * is polled cleanup and nulling should occur.
         */
        while (!m_snapshotTableTasks.isEmpty()) {       
            final SnapshotTableTask currentTask = m_snapshotTableTasks.peek();           
            assert(currentTask != null);
            LOG.trace("SNAPSHOT TASK : "+currentTask+ " on partition :"+ partition_id+ " Target :"+currentTask.m_target);

            final int headerSize = currentTask.m_target.getHeaderSize();
            final BBContainer snapshotBuffer = m_availableSnapshotBuffers.poll();
            assert(snapshotBuffer != null);
            snapshotBuffer.b.clear();
            snapshotBuffer.b.position(headerSize);
            int serialized = 0;
           
           
            //FIXME (meng)
            serialized = ee.tableStreamSerializeMore(
                   snapshotBuffer,
                   currentTask.m_tableId,
                   TableStreamType.SNAPSHOT);

            if (serialized < 0) {
                LOG.error("Failure while serialize data from a table for COW snapshot");
                HStore.crashDB();
            }
            else{
                LOG.trace("Serialized "+serialized+ " bytes for table "+currentTask.m_name+" at partition "+ partition_id);               
            }

            /**
             * The EE will return 0 when there is no more data left to pull from that table.
             * The enclosing loop ensures that the next table is then addressed.
             */
            if (serialized == 0) {
                final SnapshotTableTask t = m_snapshotTableTasks.poll();
                /**
                 * Replicated tables are assigned to a single ES on each site and that ES
                 * is responsible for closing the data target. Done in a separate
                 * thread so the EE can continue working.
                 */
                if (t.m_isReplicated) {
                    final Thread terminatorThread =
                        new Thread("Replicated SnapshotDataTarget terminator ") {
                        @Override
                        public void run() {
                            try {
                                t.m_target.close();
                            } catch (IOException e) {
                                throw new RuntimeException(e);
                            } catch (InterruptedException e) {
                                throw new RuntimeException(e);
                            }

                        }
                    };
                    if (m_snapshotTargetTerminators != null) {
                        m_snapshotTargetTerminators.add(terminatorThread);
                    }
                    terminatorThread.start();
                }
                m_availableSnapshotBuffers.offer(snapshotBuffer);
                continue;
            }

            /**
             * The block from the EE will contain raw tuple data with no length prefix etc.
             */
            snapshotBuffer.b.limit(headerSize + serialized);
            snapshotBuffer.b.position(0);
            retval = currentTask.m_target.write(snapshotBuffer);
            break;
        }

        /**
         * If there are no more tasks then this particular EE is finished doing snapshot work
         * Check the AtomicInteger to find out if this is the last one.
         */
        if (m_snapshotTableTasks.isEmpty()) {
            final ArrayList<SnapshotDataTarget> snapshotTargets = m_snapshotTargets;
            m_snapshotTargets = null;
            m_snapshotTableTasks = null;
            final int result = ExecutionSitesCurrentlySnapshotting.decrementAndGet();
            LOG.trace("ExecutionSitesCurrentlySnapshotting final dec and get :"+SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.get());               
           
            /**
             * If this is the last one then this EE must close all the SnapshotDataTargets.
             * Done in a separate thread so the EE can go and do other work. It will
             * sync every file descriptor and that may block for a while.
             */

            final Thread terminatorThread = new Thread("Snapshot terminator") {
                @Override
                public void run() {                   
                    for (final SnapshotDataTarget t : snapshotTargets) {
                        try {
                            t.close();
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e);
                        }
                    }
                }
            };

            if (m_snapshotTargetTerminators != null) {
                m_snapshotTargetTerminators.add(terminatorThread);
            }

            terminatorThread.start();
          
        }
        return retval;
    }

    /*
     * Do snapshot work exclusively until there is no more. Also blocks
     * until the fsync() and close() of snapshot data targets has completed.
     */
    public HashSet<Exception> completeSnapshotWork(ExecutionEngine ee) throws InterruptedException {
        HashSet<Exception> retval = new HashSet<Exception>();
        m_snapshotTargetTerminators = new ArrayList<Thread>();

        LOG.trace("completeSnapshotWork starts at partition :"+ee.getPartitionExecutor().getPartitionId());

        while (m_snapshotTableTasks != null) {
            Future<?> result = doSnapshotWork(ee);
            if (result != null) {
                try {
                    result.get();
                } catch (ExecutionException e) {
                    final boolean added = retval.add((Exception)e.getCause());
                    assert(added);
                } catch (Exception e) {
                    final boolean added = retval.add((Exception)e.getCause());
                    assert(added);
                }
            }
        }

        /**
         * Block until the sync has actually occurred in the forked threads.
         * The threads are spawned even in the blocking case to keep it simple.
         */
        for (final Thread t : m_snapshotTargetTerminators) {
            t.join();
        }
        m_snapshotTargetTerminators = null;
       
        /**
         * Set it to -1 indicating the system is ready to
         * perform another snapshot. Changed to wait until all
         * the previous snapshot work has finished so that
         * snapshot initiation doesn't wait on the file system
         */
        synchronized (SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting) {
            if(ExecutionSitesCurrentlySnapshotting.get() == 0){
                ExecutionSitesCurrentlySnapshotting.set(-1);
                LOG.trace("ExecutionSitesCurrentlySnapshotting reset :"+SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.get());
            }
        }
       
        LOG.trace("completeSnapshotWork ends at partition :"+ee.getPartitionExecutor().getPartitionId());
        return retval;
    }
}
// End of org.voltdb.SnapshotSiteProcessor listing.
// Retrieved from www.massapi.com; all source code remains the property of its
// respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc.