/* This file is part of VoltDB.
* Copyright (C) 2008-2010 VoltDB Inc.
*
* VoltDB is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* VoltDB is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.log4j.Logger;
import org.voltdb.jni.ExecutionEngine;
import org.voltdb.utils.DBBPool.BBContainer;
import edu.brown.hstore.HStore;
/**
* Encapsulates the state needed to manage an ongoing snapshot at the
* per-execution site level. Also contains some static global snapshot
* counters. Callers are responsible for maintaining thread safety;
* generally (and possibly exclusively) it is driven by ExecutionSite,
* each of which owns a SnapshotSiteProcessor.
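*
* A rough sketch of the intended call sequence (illustrative only; the real
* driver lives in the ExecutionSite/snapshot save path, and the variable
* names here are assumptions):
* <pre>{@code
* SnapshotSiteProcessor ssp = new SnapshotSiteProcessor(onPotentialWork);
* ssp.initiateSnapshots(ee, tasks);     // activate COW streams per table
* Future<?> f = ssp.doSnapshotWork(ee); // incremental; repeat until done
* ssp.completeSnapshotWork(ee);         // blocking drain, close/fsync targets
* }</pre>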
*/
public class SnapshotSiteProcessor {
private static final Logger LOG = Logger.getLogger(SnapshotSiteProcessor.class);
/** Global count of execution sites on this node currently performing a snapshot; -1 means no snapshot is in progress */
public static final AtomicInteger ExecutionSitesCurrentlySnapshotting =
new AtomicInteger(-1);
/**
* Ensure the first thread to run the fragment does the creation
* of the targets and the distribution of the work.
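*
* A sketch of the intended acquire pattern (illustrative; the actual driver
* is SnapshotSaveAPI):
* <pre>{@code
* if (SnapshotSiteProcessor.m_snapshotCreateSetupPermit.tryAcquire()) {
*     // First site in: create the data targets, build the per-site task
*     // lists, then release one m_snapshotPermits permit per site.
* }
* }</pre>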
*/
public static final Semaphore m_snapshotCreateSetupPermit =
new Semaphore(1);
/**
* Sites may only proceed once permits become available after setup completes.
*/
public static Semaphore m_snapshotPermits = new Semaphore(0);
/**
* Global collection populated by the snapshot creator and polled by individual sites.
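*
* Typical consumer side, sketched under the assumption that the setup thread
* has already populated the list and released the permits:
* <pre>{@code
* m_snapshotPermits.acquire();
* Deque<SnapshotTableTask> myTasks;
* synchronized (m_taskListsForSites) {
*     myTasks = m_taskListsForSites.poll();
* }
* }</pre>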
*/
public static final LinkedList<Deque<SnapshotTableTask>> m_taskListsForSites =
new LinkedList<Deque<SnapshotTableTask>>();
/** Number of snapshot buffers to keep */
static final int m_numSnapshotBuffers = 8;
/**
* Pick a buffer length big enough to store at least one tuple of the largest
* size supported by the system (2 megabytes), plus Short.MAX_VALUE (32767)
* bytes of headroom for metadata.
*/
public static final int m_snapshotBufferLength = (1024 * 1024 * 2) + Short.MAX_VALUE;
private final ArrayList<BBContainer> m_snapshotBufferOrigins =
new ArrayList<BBContainer>();
/**
* Pool of buffers available for snapshot work. A buffer leaves this queue
* when it is handed to a SnapshotDataTarget for I/O and is re-offered when
* the wrapping container is discarded, so a site can poll for a free buffer
* without extra synchronization while a snapshot runs online.
*/
private final ConcurrentLinkedQueue<BBContainer> m_availableSnapshotBuffers
= new ConcurrentLinkedQueue<BBContainer>();
/**
* The last EE out has to shut off the lights. Cache a list
* of targets in case this EE ends up being the one that needs
* to close each target.
*/
private ArrayList<SnapshotDataTarget> m_snapshotTargets;
/**
* Queue of tasks for tables that still need to be snapshotted.
* Tasks are polled from it until none remain.
*/
private ArrayDeque<SnapshotTableTask> m_snapshotTableTasks;
/**
* List of threads to join to block on snapshot completion
* when using completeSnapshotWork().
*/
private ArrayList<Thread> m_snapshotTargetTerminators = null;
/**
* Invoked when a buffer is returned to the pool, ensuring the EE wakes up
* and performs any pending snapshot work with that buffer.
*/
private final Runnable m_onPotentialSnapshotWork;
/**
* The snapshot may only finish after the digest has been written.
*/
public static AtomicBoolean m_digestWritten = new AtomicBoolean(false);
/**
* Only one partition performs createSetup() in SnapshotSaveAPI; set once that setup has finished.
*/
public static AtomicBoolean m_finishedSetup = new AtomicBoolean(false);
/**
* A class identifying a table that should be snapshotted as well as the destination
* for the resulting tuple blocks
*/
public static class SnapshotTableTask {
private final int m_tableId;
private final SnapshotDataTarget m_target;
private final boolean m_isReplicated;
private final String m_name;
public SnapshotTableTask(
final int tableId,
final SnapshotDataTarget target,
boolean isReplicated,
final String tableName) {
m_tableId = tableId;
m_target = target;
m_isReplicated = isReplicated;
m_name = tableName;
}
@Override
public String toString() {
return ("SnapshotTableTask for " + m_name );
}
}
public SnapshotSiteProcessor(Runnable onPotentialSnapshotWork) {
m_onPotentialSnapshotWork = onPotentialSnapshotWork;
initializeBufferPool();
}
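/**
* Discard all origin buffers and clear the pool. This should only be called
* once no snapshot work is in progress.
*/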
public void shutdown() {
for (BBContainer c : m_snapshotBufferOrigins ) {
c.discard();
}
m_snapshotBufferOrigins.clear();
m_availableSnapshotBuffers.clear();
}
void initializeBufferPool() {
for (int ii = 0; ii < SnapshotSiteProcessor.m_numSnapshotBuffers; ii++) {
final BBContainer origin = org.voltdb.utils.DBBPool.allocateDirect(m_snapshotBufferLength);
m_snapshotBufferOrigins.add(origin);
long snapshotBufferAddress = 0;
if (VoltDB.getLoadLibVOLTDB()) {
snapshotBufferAddress = org.voltdb.utils.DBBPool.getBufferAddress(origin.b);
}
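// Wrap the origin buffer so that discard() recycles it into the pool
// and wakes the EE, rather than freeing the underlying memory.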
m_availableSnapshotBuffers.offer(new BBContainer(origin.b, snapshotBufferAddress) {
@Override
public void discard() {
m_availableSnapshotBuffers.offer(this);
m_onPotentialSnapshotWork.run();
}
});
}
}
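/**
* Activate copy-on-write streaming in the EE for every table in the task
* list and record the data targets this site may later need to close.
*/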
public void initiateSnapshots(ExecutionEngine ee, Deque<SnapshotTableTask> tasks) {
LOG.trace("initiateSnapshots at : partition "+ee.getPartitionExecutor().getPartitionId()+ " tasks size ::"+tasks.size());
m_snapshotTableTasks = new ArrayDeque<SnapshotTableTask>(tasks);
m_snapshotTargets = new ArrayList<SnapshotDataTarget>();
for (final SnapshotTableTask task : tasks) {
assert(task != null);
assert(m_snapshotTargets != null);
if (!task.m_isReplicated) {
m_snapshotTargets.add(task.m_target);
}
// FIXME meng
if (!ee.activateTableStream(task.m_tableId, TableStreamType.SNAPSHOT)) {
LOG.error("Attempted to activate copy on write mode for table "
+ task.m_name + " and failed");
LOG.error(task);
HStore.crashDB();
} else {
LOG.trace("Activated COW mode for table " + task.m_name
+ " at partition " + ee.getPartitionExecutor().getPartitionId());
}
}
}
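/**
* Perform one increment of snapshot work: pull the next block of tuple data
* for the head task into a pooled buffer and hand it to the task's data
* target. Returns the write's Future, or null when there is nothing to do
* (no snapshot in progress, or all buffers are loaned out pending I/O).
*/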
public Future<?> doSnapshotWork(ExecutionEngine ee) {
Future<?> retval = null;
/*
* This thread will null out the reference to m_snapshotTableTasks when
* a snapshot is finished. If all snapshot buffers are loaned out, they are
* pending I/O somewhere, so there is no work to do until one comes back.
*/
if (m_snapshotTableTasks == null || m_availableSnapshotBuffers.isEmpty()) {
return retval;
}
int partition_id = ee.getPartitionExecutor().getPartitionId();
LOG.trace("doSnapshotWork at : partition "+ partition_id);
/*
* There definitely is snapshot work to do. There should be a task here;
* if there isn't, something is wrong, because cleanup and nulling occur
* when the last task is polled.
*/
while (!m_snapshotTableTasks.isEmpty()) {
final SnapshotTableTask currentTask = m_snapshotTableTasks.peek();
assert(currentTask != null);
LOG.trace("SNAPSHOT TASK : "+currentTask+ " on partition :"+ partition_id+ " Target :"+currentTask.m_target);
final int headerSize = currentTask.m_target.getHeaderSize();
final BBContainer snapshotBuffer = m_availableSnapshotBuffers.poll();
assert(snapshotBuffer != null);
snapshotBuffer.b.clear();
snapshotBuffer.b.position(headerSize);
// FIXME (meng)
final int serialized = ee.tableStreamSerializeMore(
snapshotBuffer,
currentTask.m_tableId,
TableStreamType.SNAPSHOT);
if (serialized < 0) {
LOG.error("Failed to serialize data from a table for the COW snapshot");
HStore.crashDB();
} else {
LOG.trace("Serialized " + serialized + " bytes for table "
+ currentTask.m_name + " at partition " + partition_id);
}
/**
* The EE will return 0 when there is no more data left to pull from that table.
* The enclosing loop ensures that the next table is then addressed.
*/
if (serialized == 0) {
final SnapshotTableTask t = m_snapshotTableTasks.poll();
/**
* Replicated tables are assigned to a single ES on each site and that ES
* is responsible for closing the data target. Done in a separate
* thread so the EE can continue working.
*/
if (t.m_isReplicated) {
final Thread terminatorThread =
new Thread("Replicated SnapshotDataTarget terminator ") {
@Override
public void run() {
try {
t.m_target.close();
} catch (IOException e) {
throw new RuntimeException(e);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
};
if (m_snapshotTargetTerminators != null) {
m_snapshotTargetTerminators.add(terminatorThread);
}
terminatorThread.start();
}
m_availableSnapshotBuffers.offer(snapshotBuffer);
continue;
}
/**
* The block from the EE will contain raw tuple data with no length prefix etc.
*/
snapshotBuffer.b.limit(headerSize + serialized);
snapshotBuffer.b.position(0);
retval = currentTask.m_target.write(snapshotBuffer);
break;
}
/**
* If there are no more tasks, then this particular EE is finished doing
* snapshot work. Check the AtomicInteger to find out if this is the last one.
*/
if (m_snapshotTableTasks.isEmpty()) {
final ArrayList<SnapshotDataTarget> snapshotTargets = m_snapshotTargets;
m_snapshotTargets = null;
m_snapshotTableTasks = null;
final int result = ExecutionSitesCurrentlySnapshotting.decrementAndGet();
LOG.trace("ExecutionSitesCurrentlySnapshotting after final decrement: " + result);
/**
* If this is the last one then this EE must close all the SnapshotDataTargets.
* Done in a separate thread so the EE can go and do other work. It will
* sync every file descriptor and that may block for a while.
*/
final Thread terminatorThread = new Thread("Snapshot terminator") {
@Override
public void run() {
for (final SnapshotDataTarget t : snapshotTargets) {
try {
t.close();
} catch (IOException e) {
throw new RuntimeException(e);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
};
if (m_snapshotTargetTerminators != null) {
m_snapshotTargetTerminators.add(terminatorThread);
}
terminatorThread.start();
}
return retval;
}
/**
* Do snapshot work exclusively until there is no more. Also blocks until
* the fsync() and close() of the snapshot data targets have completed.
*/
public HashSet<Exception> completeSnapshotWork(ExecutionEngine ee) throws InterruptedException {
HashSet<Exception> retval = new HashSet<Exception>();
m_snapshotTargetTerminators = new ArrayList<Thread>();
LOG.trace("completeSnapshotWork starts at partition :"+ee.getPartitionExecutor().getPartitionId());
while (m_snapshotTableTasks != null) {
Future<?> result = doSnapshotWork(ee);
if (result != null) {
try {
result.get();
} catch (ExecutionException e) {
final boolean added = retval.add((Exception)e.getCause());
assert(added);
} catch (Exception e) {
final boolean added = retval.add(e);
assert(added);
}
}
}
/**
* Block until the sync has actually occurred in the forked threads.
* The threads are spawned even in the blocking case to keep it simple.
*/
for (final Thread t : m_snapshotTargetTerminators) {
t.join();
}
m_snapshotTargetTerminators = null;
/**
* Set the counter to -1, indicating the system is ready to perform another
* snapshot. This waits until all of the previous snapshot work has finished
* so that snapshot initiation doesn't block on the file system.
*/
synchronized (SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting) {
if (ExecutionSitesCurrentlySnapshotting.get() == 0) {
ExecutionSitesCurrentlySnapshotting.set(-1);
LOG.trace("ExecutionSitesCurrentlySnapshotting reset to " + ExecutionSitesCurrentlySnapshotting.get());
}
}
}
LOG.trace("completeSnapshotWork ends at partition :"+ee.getPartitionExecutor().getPartitionId());
return retval;
}
}