Package org.voltdb

Source Code of org.voltdb.SnapshotDaemon$Snapshot

/* This file is part of VoltDB.
* Copyright (C) 2008-2014 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
*/

package org.voltdb;

import java.io.File;
import java.nio.ByteBuffer;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.zookeeper_voltpatches.CreateMode;
import org.apache.zookeeper_voltpatches.KeeperException;
import org.apache.zookeeper_voltpatches.KeeperException.NodeExistsException;
import org.apache.zookeeper_voltpatches.WatchedEvent;
import org.apache.zookeeper_voltpatches.Watcher;
import org.apache.zookeeper_voltpatches.Watcher.Event.EventType;
import org.apache.zookeeper_voltpatches.Watcher.Event.KeeperState;
import org.apache.zookeeper_voltpatches.ZooDefs.Ids;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.apache.zookeeper_voltpatches.data.Stat;
import org.json_voltpatches.JSONArray;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.voltcore.logging.Level;
import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.HostMessenger;
import org.voltcore.messaging.Mailbox;
import org.voltcore.messaging.SiteMailbox;
import org.voltcore.messaging.VoltMessage;
import org.voltcore.network.Connection;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.Pair;
import org.voltcore.utils.RateLimitedLogger;
import org.voltcore.zk.ZKUtil;
import org.voltdb.catalog.SnapshotSchedule;
import org.voltdb.client.ClientResponse;
import org.voltdb.client.ProcedureCallback;
import org.voltdb.messaging.SnapshotCheckRequestMessage;
import org.voltdb.messaging.SnapshotCheckResponseMessage;
import org.voltdb.sysprocs.saverestore.SnapshotUtil;
import org.voltdb.utils.VoltTableUtil;

import com.google_voltpatches.common.base.Throwables;
import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.util.concurrent.Callables;
import com.google_voltpatches.common.util.concurrent.ListenableFuture;
import com.google_voltpatches.common.util.concurrent.ListeningScheduledExecutorService;
import com.google_voltpatches.common.util.concurrent.MoreExecutors;

/**
* A scheduler of automated snapshots and manager of archived and retained snapshots.
* The new functionality for handling truncation snapshots operates separately from
* the old automated snapshots. They just share the same event processing threads. Future work
* should merge them.
*
* Note: comments that start with TRAIL [TruncSnap] TruncationSnapshot legend
*       are way to document code trails which may be perused via the code trails
*       eclipse plugin https://marketplace.eclipse.org/content/code-trails
*/
public class SnapshotDaemon implements SnapshotCompletionInterest {

    private class TruncationSnapshotAttempt {
        private String path;
        private String nonce;
        private boolean finished;
    }

    static int m_periodicWorkInterval = 2000;
    public static volatile int m_userSnapshotRetryInterval = 30;

    /*
     * Something that initiates procedures for the snapshot daemon.
     */
    public interface DaemonInitiator {
        public void initiateSnapshotDaemonWork(final String procedureName, long clientData, Object params[]);
    }

    private static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT");
    private static final VoltLogger loggingLog = new VoltLogger("LOGGING");
    private final ScheduledThreadPoolExecutor m_esBase =
            new ScheduledThreadPoolExecutor(1,
                    CoreUtils.getThreadFactory(null, "SnapshotDaemon", CoreUtils.SMALL_STACK_SIZE, false, null),
                                            new java.util.concurrent.ThreadPoolExecutor.DiscardPolicy());
    private final ListeningScheduledExecutorService m_es = MoreExecutors.listeningDecorator(m_esBase);

    private ZooKeeper m_zk;
    private DaemonInitiator m_initiator;
    private long m_nextCallbackHandle;
    private String m_truncationSnapshotPath;

    /*
     * Before doing truncation snapshot operations, wait a few seconds
     * to give a few nodes a chance to get into the same state WRT to truncation
     * so that a truncation snapshot will can service multiple truncation requests
     * that arrive at the same time.
     */
    int m_truncationGatheringPeriod = 10;

    private final TreeMap<Long, TruncationSnapshotAttempt> m_truncationSnapshotAttempts =
        new TreeMap<Long, TruncationSnapshotAttempt>();
    private Future<?> m_truncationSnapshotScanTask;

    private TimeUnit m_frequencyUnit;
    private long m_frequencyInMillis;
    private int m_frequency;
    private int m_retain;
    private String m_path;
    private String m_prefix;
    private String m_prefixAndSeparator;

    private SnapshotSchedule m_lastKnownSchedule = null;

    private final HashMap<Long, ProcedureCallback> m_procedureCallbacks = new HashMap<Long, ProcedureCallback>();

    private final SimpleDateFormat m_dateFormat = new SimpleDateFormat("'_'yyyy.MM.dd.HH.mm.ss");

    // true if this SnapshotDaemon is the one responsible for generating
    // snapshots
    private boolean m_isAutoSnapshotLeader = false;
    private Future<?> m_autoSnapshotTask = null;
    private long m_nextSnapshotTime;

    /**
     * Don't invoke sysprocs too close together.
     * Keep track of the last call and only do it after
     * enough time has passed.
     */
    private long m_lastSysprocInvocation = System.currentTimeMillis();
    static long m_minTimeBetweenSysprocs = 3000;

    /**
     * List of snapshots on disk sorted by creation time
     */
    final LinkedList<Snapshot> m_snapshots = new LinkedList<Snapshot>();

    /**
     * States the daemon can be in
     *
     */
    enum State {
        /*
         * Initial state
         */
        STARTUP,
        /*
         * Invoked @SnapshotScan, waiting for results.
         * Done once on startup to find number of snapshots on disk
         * at path with prefix
         */
        SCANNING,
        /*
         * Waiting in between snapshots
         */
        WAITING,
        /*
         * Deleting snapshots that are no longer going to be retained.
         */
        DELETING,
        /*
         * Initiated a snapshot. Will call snapshot scan occasionally to find out
         * when it completes.
         */
        SNAPSHOTTING,
    }

    private State m_state = State.STARTUP;

    SnapshotDaemon() {
        m_esBase.setContinueExistingPeriodicTasksAfterShutdownPolicy(false);
        m_esBase.setExecuteExistingDelayedTasksAfterShutdownPolicy(false);

        m_frequencyUnit = null;
        m_retain = 0;
        m_frequency = 0;
        m_frequencyInMillis = 0;
        m_prefix = null;
        m_path = null;
        m_prefixAndSeparator = null;



        // Register the snapshot status to the StatsAgent
        SnapshotStatus snapshotStatus = new SnapshotStatus();
        VoltDB.instance().getStatsAgent().registerStatsSource(StatsSelector.SNAPSHOTSTATUS,
                                                              0,
                                                              snapshotStatus);
        VoltDB.instance().getSnapshotCompletionMonitor().addInterest(this);
    }

    public void init(DaemonInitiator initiator, HostMessenger messenger, Runnable threadLocalInit, GlobalServiceElector gse) {
        m_initiator = initiator;
        m_zk = messenger.getZK();
        m_mb = new SiteMailbox(messenger, messenger.getHSIdForLocalSite(HostMessenger.SNAPSHOT_DAEMON_ID));
        messenger.createMailbox(m_mb.getHSId(), m_mb);

        try {
            m_zk.create(VoltZK.nodes_currently_snapshotting, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        } catch (Exception e) {}
        try {
            m_zk.create(VoltZK.completed_snapshots, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        } catch (Exception e) {}

        if (threadLocalInit != null) {
            m_es.execute(threadLocalInit);
        }

        /*
         *  Really shouldn't leak this from a constructor, and twice to boot
         *  If IV2 is enabled leader election for the snapshot daemon is always tied to
         *  leader election for the MP coordinator so that they can't be partitioned
         *  from each other.
         */
        if (gse == null) {
            m_es.execute(new Runnable() {
                @Override
                public void run() {
                    leaderElection();
                }
            });
        } else {
            gse.registerService(new Promotable() {
                @Override
                public void acceptPromotion() throws InterruptedException,
                        ExecutionException, KeeperException {
                    m_es.submit(new Runnable() {
                        @Override
                        public void run() {
                            try {
                                m_isAutoSnapshotLeader = true;
                                if (m_lastKnownSchedule != null) {
                                    makeActivePrivate(m_lastKnownSchedule);
                                }
                                electedTruncationLeader();
                            } catch (Exception e) {
                                VoltDB.crashLocalVoltDB("Exception in snapshot daemon electing master via ZK", true, e);
                            }
                        }
                    });
                }

            });
        }
    }

    private static final long INITIATION_RESPONSE_TIMEOUT_MS = 20 * 60 * 1000;
    // Tracks when the last @SnapshotSave call was issued.
    // Prevents two @SnapshotSave calls being issued back to back.
    // This is reset when a response is received for the initiation.
    private Pair<Long, Boolean> m_lastInitiationTs = null;
    private Mailbox m_mb;
    private void initiateSnapshotSave(final long handle, final Object params[], boolean blocking)
    {
        boolean success = true;
        VoltTable checkResult = SnapshotUtil.constructNodeResultsTable();
        final String jsString = String.class.cast(params[0]);

        if (m_lastInitiationTs != null) {
            final long elapsedMs = System.currentTimeMillis() - m_lastInitiationTs.getFirst();
            // Blocking snapshot may take a long time to finish, don't time it out if it's blocking
            if (!m_lastInitiationTs.getSecond() && elapsedMs > INITIATION_RESPONSE_TIMEOUT_MS) {
                SNAP_LOG.warn(String.format("A snapshot was initiated %d minutes ago and hasn't received a response yet.",
                        TimeUnit.MILLISECONDS.toMinutes(elapsedMs)));
                m_lastInitiationTs = null;
            } else {
                checkResult.addRow(CoreUtils.getHostIdFromHSId(m_mb.getHSId()), CoreUtils.getHostnameOrAddress(), null,
                        "FAILURE", "SNAPSHOT IN PROGRESS");
                success = false;
            }
        }

        if (success) {
            try {
                final JSONObject jsObj = new JSONObject(jsString);
                boolean initiateSnapshot;

                // Do scan work on all known live hosts
                VoltMessage msg = new SnapshotCheckRequestMessage(jsString);
                List<Integer> liveHosts = VoltDB.instance().getHostMessenger().getLiveHostIds();
                for (int hostId : liveHosts) {
                    m_mb.send(CoreUtils.getHSIdFromHostAndSite(hostId, HostMessenger.SNAPSHOT_IO_AGENT_ID), msg);
                }

                // Wait for responses from all hosts for a certain amount of time
                Map<Integer, VoltTable> responses = Maps.newHashMap();
                final long timeoutMs = 10 * 1000; // 10s timeout
                final long endTime = System.currentTimeMillis() + timeoutMs;
                SnapshotCheckResponseMessage response;
                while ((response = (SnapshotCheckResponseMessage) m_mb.recvBlocking(timeoutMs)) != null) {
                    // ignore responses to previous requests
                    if (jsObj.getString("path").equals(response.getPath()) &&
                        jsObj.getString("nonce").equals(response.getNonce())) {
                        responses.put(CoreUtils.getHostIdFromHSId(response.m_sourceHSId), response.getResponse());
                    }

                    if (responses.size() == liveHosts.size() || System.currentTimeMillis() > endTime) {
                        break;
                    }
                }

                if (responses.size() != liveHosts.size()) {
                    checkResult.addRow(CoreUtils.getHostIdFromHSId(m_mb.getHSId()), CoreUtils.getHostnameOrAddress(), null,
                            "FAILURE", "TIMED OUT CHECKING SNAPSHOT FEASIBILITY");
                    success = false;
                }

                if (success) {
                    // TRAIL [TruncSnap:12] all participating nodes have initiated successfully
                    // Call @SnapshotSave if check passed, return the failure otherwise
                    checkResult = VoltTableUtil.unionTables(responses.values());
                    initiateSnapshot = SnapshotUtil.didSnapshotRequestSucceed(new VoltTable[]{checkResult});

                    if (initiateSnapshot) {
                        m_lastInitiationTs = Pair.of(System.currentTimeMillis(), blocking);
                        m_initiator.initiateSnapshotDaemonWork("@SnapshotSave", handle, params);
                    } else {
                        success = false;
                    }
                }
            } catch (JSONException e) {
                success = false;
                checkResult.addRow(CoreUtils.getHostIdFromHSId(m_mb.getHSId()), CoreUtils.getHostnameOrAddress(), null, "FAILURE", "ERROR PARSING JSON");
                SNAP_LOG.warn("Error parsing JSON string: " + jsString, e);
            }
        }

        if (!success) {
            final ClientResponseImpl failureResponse =
                    new ClientResponseImpl(ClientResponseImpl.SUCCESS, new VoltTable[]{checkResult}, null);
            failureResponse.setClientHandle(handle);
            processClientResponse(Callables.returning(failureResponse));
        }
    }

    private void saveResponseToZKAndReset(String requestId, ClientResponseImpl response) throws Exception
    {
        ByteBuffer buf = ByteBuffer.allocate(response.getSerializedSize());
        m_zk.create(VoltZK.user_snapshot_response + requestId,
                response.flattenToBuffer(buf).array(),
                Ids.OPEN_ACL_UNSAFE,
                CreateMode.PERSISTENT);
        userSnapshotRequestExistenceCheck(true);
    }

    /*
     *  Search for truncation snapshots, after a failure there may be
     *  ones we don't know about, there may be ones from a previous instance etc.
     *  Do this every five minutes as an easy hack to make sure we don't leak them.
     *  Next time groom is called it will delete the old ones after a success.
     */
    private void scanTruncationSnapshots() {
        if (m_truncationSnapshotPath == null) {
            try {
                m_truncationSnapshotPath = new String(m_zk.getData(VoltZK.test_scan_path, false, null), "UTF-8");
            } catch (Exception e) {
                return;
            }
        }

        Object params[] = new Object[1];
        params[0] = m_truncationSnapshotPath;
        long handle = m_nextCallbackHandle++;
        m_procedureCallbacks.put(handle, new ProcedureCallback() {

            @Override
            public void clientCallback(final ClientResponse clientResponse)
                    throws Exception {
                if (clientResponse.getStatus() != ClientResponse.SUCCESS){
                    SNAP_LOG.error(clientResponse.getStatusString());
                    return;
                }

                final VoltTable results[] = clientResponse.getResults();
                if (results.length == 1) {
                    final VoltTable result = results[0];
                    boolean advanced = result.advanceRow();
                    assert(advanced);
                    assert(result.getColumnCount() == 1);
                    assert(result.getColumnType(0) == VoltType.STRING);
                    loggingLog.error("Snapshot scan failed with failure response: " + result.getString("ERR_MSG"));
                    return;
                }
                assert(results.length == 3);

                final VoltTable snapshots = results[0];
                assert(snapshots.getColumnCount() == 9);

                TreeMap<Long, TruncationSnapshotAttempt> foundSnapshots =
                    new TreeMap<Long, TruncationSnapshotAttempt>();
                while (snapshots.advanceRow()) {
                    final String path = snapshots.getString("PATH");
                    final String nonce = snapshots.getString("NONCE");
                    final Long txnId = snapshots.getLong("TXNID");
                    TruncationSnapshotAttempt snapshotAttempt = new TruncationSnapshotAttempt();
                    snapshotAttempt.path = path;
                    snapshotAttempt.nonce = nonce;
                    foundSnapshots.put(txnId, snapshotAttempt);
                }

                for (Map.Entry<Long, TruncationSnapshotAttempt> entry : foundSnapshots.entrySet()) {
                    if (!m_truncationSnapshotAttempts.containsKey(entry.getKey())) {
                        loggingLog.info("Truncation snapshot scan discovered new snapshot txnid " + entry.getKey() +
                                " path " + entry.getValue().path + " nonce " + entry.getValue().nonce);
                        m_truncationSnapshotAttempts.put(entry.getKey(), entry.getValue());
                    }
                }
            }

        });
        m_initiator.initiateSnapshotDaemonWork("@SnapshotScan", handle, params);
    }

    /*
     * Delete all snapshots older then the last successful snapshot.
     * This only effects snapshots used for log truncation
     */
    private void groomTruncationSnapshots() {
        ArrayList<TruncationSnapshotAttempt> toDelete = new ArrayList<TruncationSnapshotAttempt>();
        boolean foundMostRecentSuccess = false;
        Iterator<Map.Entry<Long, TruncationSnapshotAttempt>> iter =
            m_truncationSnapshotAttempts.descendingMap().entrySet().iterator();
        loggingLog.info("Snapshot daemon grooming truncation snapshots");
        while (iter.hasNext()) {
            Map.Entry<Long, TruncationSnapshotAttempt> entry = iter.next();
            TruncationSnapshotAttempt snapshotAttempt = entry.getValue();
            if (!foundMostRecentSuccess) {
                if (snapshotAttempt.finished) {
                    loggingLog.info("Found most recent successful snapshot txnid " + entry.getKey()
                            + " path " + entry.getValue().path + " nonce " + entry.getValue().nonce);
                    foundMostRecentSuccess = true;
                } else {
                    loggingLog.info("Retaining possible partial snapshot txnid " + entry.getKey()
                            + " path " + entry.getValue().path + " nonce " + entry.getValue().nonce);
                }
            } else {
                loggingLog.info("Deleting old unecessary snapshot txnid " + entry.getKey()
                        + " path " + entry.getValue().path + " nonce " + entry.getValue().nonce);
                toDelete.add(entry.getValue());
                iter.remove();
            }
        }

        String paths[] = new String[toDelete.size()];
        String nonces[] = new String[toDelete.size()];

        int ii = 0;
        for (TruncationSnapshotAttempt attempt : toDelete) {
            paths[ii] = attempt.path;
            nonces[ii++] = attempt.nonce;
        }

        Object params[] =
            new Object[] {
                paths,
                nonces,
                };
        long handle = m_nextCallbackHandle++;
        m_procedureCallbacks.put(handle, new ProcedureCallback() {

            @Override
            public void clientCallback(ClientResponse clientResponse)
                    throws Exception {
                if (clientResponse.getStatus() != ClientResponse.SUCCESS) {
                    SNAP_LOG.error(clientResponse.getStatusString());
                }
            }

        });
        m_initiator.initiateSnapshotDaemonWork("@SnapshotDelete", handle, params);
    }

    /*
     * If this cluster has per partition transactions ids carried over from
     * previous instances, retrieve them from ZK and pass them to snapshot save so that it can
     * include them in the snapshot
     */
    private JSONArray retrievePerPartitionTransactionIds() {
        JSONArray retval = new JSONArray();
        try {
            ByteBuffer values = ByteBuffer.wrap(m_zk.getData(VoltZK.perPartitionTxnIds, false, null));
            int numKeys = values.getInt();
            for (int ii = 0; ii < numKeys; ii++) {
                retval.put(values.getLong());
            }
        } catch (KeeperException.NoNodeException e) {/*doesn't have to exist*/}
        catch (Exception e) {
            VoltDB.crashLocalVoltDB("Failed to retrieve per partition transaction ids for snapshot", false, e);
        }
        return retval;
    }


    /**
     * Leader election for snapshots.
     * Leader will watch for truncation and user snapshot requests
     */
    private void leaderElection() {
        loggingLog.info("Starting leader election for snapshot truncation daemon");
        try {
            while (true) {
                Stat stat = m_zk.exists(VoltZK.snapshot_truncation_master, new Watcher() {
                    @Override
                    public void process(WatchedEvent event) {
                        switch(event.getType()) {
                        case NodeDeleted:
                            loggingLog.info("Detected the snapshot truncation leader's ephemeral node deletion");
                            m_es.execute(new Runnable() {
                                @Override
                                public void run() {
                                    leaderElection();
                                }
                            });
                            break;
                            default:
                                break;
                        }
                    }
                });
                if (stat == null) {
                    try {
                        m_zk.create(VoltZK.snapshot_truncation_master, null, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
                        m_isAutoSnapshotLeader = true;
                        if (m_lastKnownSchedule != null) {
                            makeActivePrivate(m_lastKnownSchedule);
                        }
                        electedTruncationLeader();
                        return;
                    } catch (NodeExistsException e) {
                    }
                } else {
                    loggingLog.info("Leader election concluded, a leader already exists");
                    break;
                }
            }
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Exception in snapshot daemon electing master via ZK", true, e);
        }
    }

    /*
     * Invoked when this snapshot daemon has been elected as leader
     */
    private void electedTruncationLeader() throws Exception {
        loggingLog.info("This node was selected as the leader for snapshot truncation");
        m_truncationSnapshotScanTask = m_es.scheduleWithFixedDelay(new Runnable() {
            @Override
            public void run() {
                try {
                    scanTruncationSnapshots();
                } catch (Exception e) {
                    loggingLog.error("Error during scan and group of truncation snapshots");
                }
            }
        }, 0, 1, TimeUnit.HOURS);
        try {
            // TRAIL [TruncSnap:1] elected as leader
            truncationRequestExistenceCheck();
            userSnapshotRequestExistenceCheck(false);
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Error while accepting snapshot daemon leadership", true, e);
        }
    }

    /*
     * Process the event generated when the node for a truncation request
     * is created, reschedules it for a few seconds later
     */
    private void processTruncationRequestEvent(final WatchedEvent event) {
        if (event.getType() == EventType.NodeChildrenChanged) {
            loggingLog.info("Scheduling truncation request processing 10 seconds from now");
            /*
             * Do it 10 seconds later because these requests tend to come in bunches
             * and we want one truncation snapshot to do truncation for all nodes
             * so we don't get them back to back
             *
             * TRAIL [TruncSnap:5] wait 10 secs to process request
             */
            m_es.schedule(new Runnable() {
                @Override
                public void run() {
                    try {
                        processSnapshotTruncationRequestCreated(event);
                    } catch (Exception e) {
                        VoltDB.crashLocalVoltDB("Error processing snapshot truncation request creation", true, e);
                    }
                }
            }, m_truncationGatheringPeriod, TimeUnit.SECONDS);
            return;
        } else {
            /*
             * We are very careful to cancel the watch if we find that a truncation requests exists. We are
             * the only thread and daemon that should delete the node or change the data and the watch
             * isn't set when that happens because it is part of processing the request and the watch should
             * either be canceled or have already fired.
             */
            VoltDB.crashLocalVoltDB(
                    "Trunction request watcher fired with event type other then created: " + event.getType(),
                    true,
                    null);
        }
    }

    /*
     * A ZK event occured requestion a truncation snapshot be taken
     */
    private void processSnapshotTruncationRequestCreated(
            final WatchedEvent event) {
        loggingLog.info("Snapshot truncation leader received snapshot truncation request");
        String snapshotPathTemp;
        // TRAIL [TruncSnap:6] Get the snapshot path.
        try {
            snapshotPathTemp = new String(m_zk.getData(VoltZK.truncation_snapshot_path, false, null), "UTF-8");
        } catch (Exception e) {
            loggingLog.error("Unable to retrieve truncation snapshot path from ZK, log can't be truncated");
            return;
        }
        // Get the truncation request ID which is the truncation request node path.
        final String truncReqId;
        try {
            List<String> children = m_zk.getChildren(event.getPath(), false);
            if (children.isEmpty()) {
                loggingLog.error("Unable to retrieve truncation snapshot request id from ZK, log can't be truncated");
                return;
            }
            truncReqId = ZKUtil.joinZKPath(event.getPath(), Collections.max(children));

        } catch (Exception e) {
            loggingLog.error("Unable to retrieve truncation snapshot request ID from ZK, log can't be truncated");
            return;
        }
        m_truncationSnapshotPath = snapshotPathTemp;
        final String snapshotPath = snapshotPathTemp;
        final long now = System.currentTimeMillis();
        final String nonce = Long.toString(now);
        //Allow nodes to check and see if the nonce incoming for a snapshot is
        //for a truncation snapshot. In that case they will mark the completion node
        //to be for a truncation snapshot. SnapshotCompletionMonitor notices the mark.
        // TRAIL [TruncSnap:7] write current ts to request zk node data
        try {
            ByteBuffer payload = ByteBuffer.allocate(8);
            payload.putLong(0, now);
            m_zk.setData(VoltZK.request_truncation_snapshot, payload.array(), -1);
        } catch (Exception e) {
            //Cause a cascading failure?
            VoltDB.crashLocalVoltDB("Setting data on the truncation snapshot request in ZK should never fail", true, e);
        }
        // for the snapshot save invocations
        JSONObject jsObj = new JSONObject();
        try {
            String sData = "";
            if (truncReqId != null) {
                JSONObject jsData = new JSONObject();
                jsData.put("truncReqId", truncReqId);
                sData = jsData.toString();
            }
            jsObj.put("path", snapshotPath );
            jsObj.put("nonce", nonce);
            jsObj.put("perPartitionTxnIds", retrievePerPartitionTransactionIds());
            jsObj.put("data", sData);
        } catch (JSONException e) {
            /*
             * Should never happen, so fail fast
             */
            VoltDB.crashLocalVoltDB("", true, e);
        }

        // for the snapshot save invocations
        long handle = m_nextCallbackHandle++;

        // for the snapshot save invocation
        m_procedureCallbacks.put(handle, new ProcedureCallback() {

            @Override
            public void clientCallback(ClientResponse clientResponse)
                    throws Exception {
                m_lastInitiationTs = null;
                if (clientResponse.getStatus() != ClientResponse.SUCCESS){
                    loggingLog.warn(
                            "Attempt to initiate a truncation snapshot was not successful: " +
                            clientResponse.getStatusString());
                    loggingLog.warn("Retrying log truncation snapshot in 5 minutes");
                    /*
                     * TRAIL [TruncSnap:8] (callback) on failed response try again in a few minute
                     */
                    m_es.schedule(new Runnable() {
                        @Override
                        public void run() {
                            try {
                                processTruncationRequestEvent(event);
                            } catch (Exception e) {
                                VoltDB.crashLocalVoltDB("Error processing snapshot truncation request event", true, e);
                            }
                        }
                    }, 5, TimeUnit.MINUTES);
                    return;
                }

                final VoltTable results[] = clientResponse.getResults();
                final VoltTable result = results[0];
                boolean success = true;

                final String err = SnapshotUtil.didSnapshotRequestFailWithErr(results);
                if (err != null) {
                    if (err.trim().equalsIgnoreCase("SNAPSHOT IN PROGRESS")) {
                        loggingLog.info("Snapshot is in progress");
                    } else {
                        loggingLog.warn("Snapshot failed with failure response: " + err);
                    }
                    success = false;
                }

                //assert(result.getColumnName(1).equals("TABLE"));
                if (success) {
                    while (result.advanceRow()) {
                        if (!result.getString("RESULT").equals("SUCCESS")) {
                            success = false;
                            loggingLog.warn("Snapshot save feasibility test failed for host "
                                    + result.getLong("HOST_ID") + " table " + result.getString("TABLE") +
                                    " with error message " + result.getString("ERR_MSG"));
                        }
                    }
                }

                if (success) {
                    loggingLog.info("Snapshot initiation for log truncation was successful");

                    JSONObject obj = new JSONObject(clientResponse.getAppStatusString());
                    final long snapshotTxnId = Long.valueOf(obj.getLong("txnId"));
                    try {
                        boolean found = false;
                        ZKUtil.VoidCallback lastCallback = null;

                        for (String child: m_zk.getChildren(event.getPath(),false)) {
                            String requestId = ZKUtil.joinZKPath(event.getPath(), child);
                            found = found || requestId.equals(truncReqId);

                            lastCallback = new ZKUtil.VoidCallback();
                            m_zk.delete(requestId, -1, lastCallback, null);
                        }

                        if (lastCallback != null) {
                            try {
                                lastCallback.get();
                            } catch (KeeperException.NoNodeException ignoreIt) {
                            }
                        }
                        if (!found) {
                            VoltDB.crashLocalVoltDB(
                                    "Could not match truncations snapshot request id while atepting its removal", true, null);
                        }
                    } catch (Exception e) {
                        VoltDB.crashLocalVoltDB(
                                "Unexpected error deleting truncation snapshot request", true, e);
                    }

                    try {
                        TruncationSnapshotAttempt snapshotAttempt =
                            m_truncationSnapshotAttempts.get(snapshotTxnId);
                        if (snapshotAttempt == null) {
                            snapshotAttempt = new TruncationSnapshotAttempt();
                            m_truncationSnapshotAttempts.put(snapshotTxnId, snapshotAttempt);
                        }
                        snapshotAttempt.nonce = nonce;
                        snapshotAttempt.path = snapshotPath;
                    } finally {
                        // TRAIL [TruncSnap:9] (callback) restart the whole request check cycle
                        try {
                            truncationRequestExistenceCheck();
                        } catch (Exception e) {
                            VoltDB.crashLocalVoltDB(
                                    "Unexpected error checking for existence of truncation snapshot request"
                                    , true, e);
                        }
                    }
                } else {
                    loggingLog.info("Retrying log truncation snapshot in 60 seconds");
                    /*
                     * TRAIL [TruncSnap:10] (callback) on table reported failure try again in a few minutes
                     */
                    m_es.schedule(new Runnable() {
                        @Override
                        public void run() {
                            try {
                                processTruncationRequestEvent(event);
                            } catch (Exception e) {
                                VoltDB.crashLocalVoltDB("Exception processing truncation request event", true, e);
                            }
                        }
                    }, 1, TimeUnit.MINUTES);
                }
            }

        });
        try {
            loggingLog.info("Initiating @SnapshotSave for log truncation");
            initiateSnapshotSave(handle, new Object[]{jsObj.toString(4)}, false);
        } catch (JSONException e) {
            /*
             * Should never happen, so fail fast
             */
            VoltDB.crashLocalVoltDB("", true, e);
        }
        return;
    }

    private TruncationRequestExistenceWatcher m_currentTruncationWatcher = new TruncationRequestExistenceWatcher();
    /*
     * Watcher that handles changes to the ZK node for
     * internal truncation snapshot requests
     */
    private class TruncationRequestExistenceWatcher extends ZKUtil.CancellableWatcher {

        public TruncationRequestExistenceWatcher() {
            super(m_es);
        }

        @Override
        public void pProcess(final WatchedEvent event) {
            if (event.getState() == KeeperState.Disconnected) return;
            try {
                // TRAIL [TruncSnap:4] watch event on zk node fires
                processTruncationRequestEvent(event);
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB("Error procesing truncation request event", true, e);
            }
        }
    };

    /*
     * Watcher that handles events to the user snapshot request node
     * in ZK
     */
    private final Watcher m_userSnapshotRequestExistenceWatcher = new Watcher() {

        @Override
        public void process(final WatchedEvent event) {
            if (event.getState() == KeeperState.Disconnected) return;

            m_es.execute(new Runnable() {
                @Override
                public void run() {
                    try {
                        processUserSnapshotRequestEvent(event);
                    } catch (Exception e) {
                        VoltDB.crashLocalVoltDB("Error processing user snapshot request event", true, e);
                    }
                }
            });
        }
    };

    /*
     * Process the event generated when the node for a user snapshot request
     * is created.
     */
    private void processUserSnapshotRequestEvent(final WatchedEvent event) throws Exception {
        if (event.getType() == EventType.NodeCreated) {
            byte data[] = m_zk.getData(event.getPath(), false, null);
            String jsonString = new String(data, "UTF-8");
            final JSONObject jsObj = new JSONObject(jsonString);
            final String requestId = jsObj.getString("requestId");
            final boolean blocking = jsObj.getBoolean("block");
            /*
             * Going to reuse the request object, remove the requestId
             * field now that it is consumed
             */
            jsObj.remove("requestId");
            jsObj.put("perPartitionTxnIds", retrievePerPartitionTransactionIds());
            final long handle = m_nextCallbackHandle++;
            m_procedureCallbacks.put(handle, new ProcedureCallback() {

                @Override
                public void clientCallback(ClientResponse clientResponse) {
                    m_lastInitiationTs = null;
                    try {
                        /*
                         * If there is an error then we are done.
                         */
                        if (clientResponse.getStatus() != ClientResponse.SUCCESS) {
                            ClientResponseImpl rimpl = (ClientResponseImpl)clientResponse;
                            saveResponseToZKAndReset(requestId, rimpl);
                            return;
                        }

                        /*
                         * Now analyze the response. If a snapshot was in progress
                         * we have to reattempt it later, and send a response to the client
                         * saying it was queued. Otherwise, forward the response
                         * failure/success to the client.
                         */
                        if (isSnapshotInProgressResponse(clientResponse)) {
                            scheduleSnapshotForLater(jsObj.toString(4), requestId, true);
                        } else {
                            ClientResponseImpl rimpl = (ClientResponseImpl)clientResponse;
                            saveResponseToZKAndReset(requestId, rimpl);
                            return;
                        }
                    } catch (Exception e) {
                        SNAP_LOG.error("Error processing user snapshot request", e);
                        try {
                            userSnapshotRequestExistenceCheck(true);
                        } catch (Exception e2) {
                            VoltDB.crashLocalVoltDB("Error resetting watch for user snapshots", true, e2);
                        }
                    }
                }
            });
            initiateSnapshotSave(handle, new Object[]{jsObj.toString(4)}, blocking);
            return;
        }
    }

    /*
     * Schedule a user snapshot request for later since the database was busy.
     * Continue doing this as long as the error response returned by the DB is snapshot in progress.
     * Since the snapshot is being scheduled for later we will send an immediate response to the client
     * via ZK relay.
     */
    private void scheduleSnapshotForLater(
            final String requestObj,
            final String requestId,
            final boolean isFirstAttempt
            ) throws Exception {
        /*
         * Only need to send the queue response the first time we attempt to schedule the snapshot
         * for later. It may be necessary to reschedule via this function multiple times.
         */
        if (isFirstAttempt) {
            SNAP_LOG.info("A user snapshot request could not be immediately fulfilled and will be reattempted later");
            /*
             * Construct a result to send to the client right now via ZK
             * saying we queued it to run later
             */
            VoltTable result = SnapshotUtil.constructNodeResultsTable();
            result.addRow(-1,
                    CoreUtils.getHostnameOrAddress(),
                    "",
                    "SUCCESS",
                    "SNAPSHOT REQUEST QUEUED");
            final ClientResponseImpl queuedResponse =
                new ClientResponseImpl(ClientResponseImpl.SUCCESS,
                                       new VoltTable[] { result },
                                       "Snapshot request could not be fulfilled because a snapshot " +
                                         "is in progress. It was queued for execution",
                                       0);
            ByteBuffer buf = ByteBuffer.allocate(queuedResponse.getSerializedSize());
            m_zk.create(VoltZK.user_snapshot_response + requestId,
                        queuedResponse.flattenToBuffer(buf).array(),
                        Ids.OPEN_ACL_UNSAFE,
                        CreateMode.PERSISTENT);
        }

        /*
         * Now queue the request for later
         */
        final Runnable r = new Runnable() {
            @Override
            public void run() {
                try {
                    /*
                     * Construct a callback to handle the response to the
                     * @SnapshotSave invocation that will reattempt the user snapshot
                     */
                    final long handle = m_nextCallbackHandle++;
                    m_procedureCallbacks.put(handle, new ProcedureCallback() {
                        @Override
                        public void clientCallback(ClientResponse clientResponse) {
                            m_lastInitiationTs = null;
                            try {
                                /*
                                 * If there is an error then we are done
                                 * attempting this user snapshot. The params must be bad
                                 * or things are broken.
                                 */
                                if (clientResponse.getStatus() != ClientResponse.SUCCESS) {
                                    SNAP_LOG.error(clientResponse.getStatusString());
                                    //Reset the watch, in case this is recoverable
                                    userSnapshotRequestExistenceCheck(true);
                                    return;
                                }

                                VoltTable results[] = clientResponse.getResults();
                                //Do this check to avoid an NPE
                                if (results == null || results.length == 0 || results[0].getRowCount() < 1) {
                                    SNAP_LOG.error("Queued user snapshot request reattempt received an unexpected response" +
                                            " and will not be reattempted. The client response is (status: " +
                                            clientResponse.getStatus() + " " + clientResponse.getStatusString() +
                                            " result: " + (results != null && results.length > 0 ? results[0] : "null") +
                                            ")");
                                    /*
                                     * Don't think this should happen, reset the watch to allow later requests
                                     */
                                    userSnapshotRequestExistenceCheck(true);
                                    return;
                                }

                                VoltTable result = results[0];
                                boolean snapshotInProgress = false;
                                boolean haveFailure = false;
                                while (result.advanceRow()) {
                                    if (result.getString("RESULT").equals("FAILURE")) {
                                        if (result.getString("ERR_MSG").equals("SNAPSHOT IN PROGRESS")) {
                                            snapshotInProgress = true;
                                        } else {
                                            haveFailure = true;
                                        }
                                    }
                                }

                                /*
                                 * If a snapshot was in progress, reattempt later, otherwise,
                                 * if there was a failure, abort the attempt and log.
                                 */
                                if (snapshotInProgress) {
                                    SNAP_LOG.info("Queued user snapshot was reattempted, but a snapshot was " +
                                            " still in progress. It will be reattempted.");
                                    //Turtles all the way down
                                    scheduleSnapshotForLater(
                                            requestObj,
                                            requestId,
                                            false);
                                } else if (haveFailure) {
                                    SNAP_LOG.info("Queued user snapshot was attempted, but there was a failure.");
                                    try {
                                        ClientResponseImpl rimpl = (ClientResponseImpl)clientResponse;
                                        saveResponseToZKAndReset(requestId, rimpl);
                                    }
                                    catch (NodeExistsException e) {
                                        // used to pass null as request ID to avoid this check if the request ID
                                        // already existed, this gives us the same behavior with a pre-existing
                                        // request ID
                                    }
                                    //Log the details of the failure, after resetting the watch in case of some odd NPE
                                    result.resetRowPosition();
                                    SNAP_LOG.info(result);
                                } else {
                                    try {
                                        SNAP_LOG.debug("Queued user snapshot was successfully requested, saving to path " +
                                                VoltZK.user_snapshot_response + requestId);
                                        /*
                                         * Snapshot was started no problem, reset the watch for new requests
                                         */
                                        ClientResponseImpl rimpl = (ClientResponseImpl)clientResponse;
                                        saveResponseToZKAndReset(requestId, rimpl);
                                    }
                                    catch (NodeExistsException e) {
                                        // used to pass null as request ID to avoid this check if the request ID
                                        // already existed, this gives us the same behavior with a pre-existing
                                        // request ID
                                    }
                                    return;
                                }
                            } catch (Exception e) {
                                SNAP_LOG.error("Error processing procedure callback for user snapshot", e);
                                try {
                                    userSnapshotRequestExistenceCheck(true);
                                } catch (Exception e1) {
                                    VoltDB.crashLocalVoltDB(
                                            "Error resetting watch for user snapshot requests", true, e1);
                                }
                            }
                        }
                    });

                    initiateSnapshotSave(handle, new Object[]{requestObj}, false);
                } catch (Exception e) {
                    try {
                        userSnapshotRequestExistenceCheck(true);
                    } catch (Exception e1) {
                        VoltDB.crashLocalVoltDB("Error checking for existence of user snapshots", true, e1);
                    }
                }
            }
        };
        m_es.schedule(r, m_userSnapshotRetryInterval, TimeUnit.SECONDS);
    }

    /*
     * Check a client response to and determine if it is a snapshot in progress response
     * to a snapshot request
     */
    private boolean isSnapshotInProgressResponse(
            ClientResponse response) {
        if (response.getStatus() != ClientResponse.SUCCESS) {
            return false;
        }

        if (response.getResults() == null) {
            return false;
        }

        if (response.getResults().length < 1) {
            return false;
        }

        VoltTable results = response.getResults()[0];
        if (results.getRowCount() < 1) {
            return false;
        }

        boolean snapshotInProgress = false;
        while (results.advanceRow()) {
            if (results.getString("RESULT").equals("FAILURE")) {
                if (results.getString("ERR_MSG").equals("SNAPSHOT IN PROGRESS")) {
                    snapshotInProgress = true;
                }
            }
        }
        return snapshotInProgress;
    }

    /*
     * Set the watch in ZK on the node that represents an internal request
     * for a truncation snapshot
     */
    void truncationRequestExistenceCheck() throws KeeperException, InterruptedException {
        loggingLog.info("Checking for existence of snapshot truncation request");
        m_currentTruncationWatcher.cancel();
        m_currentTruncationWatcher = new TruncationRequestExistenceWatcher();
        // TRAIL [TruncSnap:2] checking for zk node existence
        List<String> requests = m_zk.getChildren(VoltZK.request_truncation_snapshot, m_currentTruncationWatcher);
        if (!requests.isEmpty()) {
            loggingLog.info("A truncation request node already existed, processing truncation request event");
            m_currentTruncationWatcher.cancel();
            // TRAIL [TruncSnap:3] fake a node created event (req ZK node already there)
            processTruncationRequestEvent(new WatchedEvent(
                    EventType.NodeChildrenChanged,
                    KeeperState.SyncConnected,
                    VoltZK.request_truncation_snapshot));
        }
    }

    /*
     * Set the watch in ZK on the node that represents a user
     * request for a snapshot
     */
    void userSnapshotRequestExistenceCheck(boolean deleteExistingRequest) throws Exception {
        if (deleteExistingRequest) {
            m_zk.delete(VoltZK.user_snapshot_request, -1, null, null);
        }
        if (m_zk.exists(VoltZK.user_snapshot_request, m_userSnapshotRequestExistenceWatcher) != null) {
            processUserSnapshotRequestEvent(new WatchedEvent(
                    EventType.NodeCreated,
                    KeeperState.SyncConnected,
                    VoltZK.user_snapshot_request));
        }
    }

    /**
     * Make this SnapshotDaemon responsible for generating snapshots
     */
    public ListenableFuture<Void> mayGoActiveOrInactive(final SnapshotSchedule schedule)
    {
        return m_es.submit(new Callable<Void>() {
            @Override
            public Void call() throws Exception {
                makeActivePrivate(schedule);
                return null;
            }
        });
    }

    private void makeActivePrivate(final SnapshotSchedule schedule) {
        m_lastKnownSchedule = schedule;
        if (schedule.getEnabled()) {
            m_frequency = schedule.getFrequencyvalue();
            m_retain = schedule.getRetain();
            m_path = schedule.getPath();
            m_prefix = schedule.getPrefix();
            m_prefixAndSeparator = m_prefix + "_";
            final String frequencyUnitString = schedule.getFrequencyunit().toLowerCase();
            assert(frequencyUnitString.length() == 1);
            final char frequencyUnit = frequencyUnitString.charAt(0);

            switch (frequencyUnit) {
            case 's':
                m_frequencyUnit = TimeUnit.SECONDS;
                break;
            case 'm':
                m_frequencyUnit = TimeUnit.MINUTES;
                break;
            case 'h':
                m_frequencyUnit = TimeUnit.HOURS;
                break;
                default:
                    throw new RuntimeException("Frequency unit " + frequencyUnitString + "" +
                            " in snapshot schedule is not one of d,m,h");
            }
            m_frequencyInMillis = TimeUnit.MILLISECONDS.convert( m_frequency, m_frequencyUnit);
            m_nextSnapshotTime = System.currentTimeMillis() + m_frequencyInMillis;
        }

        if (m_isAutoSnapshotLeader) {
            if (schedule.getEnabled()) {
                if (m_autoSnapshotTask == null) {
                    m_autoSnapshotTask = m_es.scheduleAtFixedRate(new Runnable() {
                        @Override
                        public void run() {
                            try {
                                doPeriodicWork(System.currentTimeMillis());
                            } catch (Exception e) {
                                SNAP_LOG.warn("Error doing periodic snapshot management work", e);
                            }
                        }
                    }, 0, m_periodicWorkInterval, TimeUnit.MILLISECONDS);
                }
            } else {
                if (m_autoSnapshotTask != null) {
                    m_autoSnapshotTask.cancel(false);
                    m_autoSnapshotTask = null;
                }
            }
        }
    }

    public void makeInactive() {
        m_es.execute(new Runnable() {
            @Override
            public void run() {

                m_snapshots.clear();
            }
        });
    }

    private class Snapshot implements Comparable<Snapshot> {
        private final String path;
        private final String nonce;
        private final Long txnId;

        private Snapshot (String path, String nonce, Long txnId) {
            this.path = path;
            this.nonce = nonce;
            this.txnId = txnId;
        }

        @Override
        public int compareTo(Snapshot o) {
            return txnId.compareTo(o.txnId);
        }

        @Override
        public String toString() {
            return path + "/" + nonce;
        }
    }

    /**
     * Invoked by the client interface occasionally. Returns null
     * if nothing needs to be done or the name of a sysproc along with procedure parameters
     * if there is work to be done. Responses come back later via invocations
     * of processClientResponse
     * @param now Current time
     * @return null if there is no work to do or a sysproc with parameters if there is work
     */
    private void doPeriodicWork(final long now) {
        if (m_lastKnownSchedule == null)
        {
            setState(State.STARTUP);
            return;
        }

        if (m_frequencyUnit == null) {
            return;
        }

        if (m_state == State.STARTUP) {
            initiateSnapshotScan();
        } else if (m_state == State.SCANNING) {
            RateLimitedLogger.tryLogForMessage(System.nanoTime(), 5, TimeUnit.MINUTES, SNAP_LOG, Level.INFO, "Blocked in scanning");
            return;
        } else if (m_state == State.WAITING){
            processWaitingPeriodicWork(now);
        } else if (m_state == State.SNAPSHOTTING) {
            return;
        } else if (m_state == State.DELETING){
            return;
        }
    }

    /**
     * Do periodic work when the daemon is in the waiting state. The
     * daemon paces out sysproc invocations over time
     * to avoid disrupting regular work. If the time for the next
     * snapshot has passed it attempts to initiate a new snapshot.
     * If there are too many snapshots being retains it attempts to delete
     * the extras. Then it attempts to initiate a new snapshot if
     * one is due
     */
    private void processWaitingPeriodicWork(long now) {
        if (now - m_lastSysprocInvocation < m_minTimeBetweenSysprocs) {
            return;
        }

        if (m_snapshots.size() > m_retain) {
            //Quick hack to make sure we don't delete while the snapshot is running.
            //Deletes work really badly during a snapshot because the FS is occupied
            if (!SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.isEmpty()) {
                m_lastSysprocInvocation = System.currentTimeMillis() + 3000;
                return;
            }
            deleteExtraSnapshots();
            return;
        }

        if (m_nextSnapshotTime < now) {
            initiateNextSnapshot(now);
            return;
        }
    }

    private void initiateNextSnapshot(long now) {
        setState(State.SNAPSHOTTING);
        m_lastSysprocInvocation = now;
        final Date nowDate = new Date(now);
        final String dateString = m_dateFormat.format(nowDate);
        final String nonce = m_prefix + dateString;
        JSONObject jsObj = new JSONObject();
        try {
            jsObj.put("path", m_path);
            jsObj.put("nonce", nonce);
            jsObj.put("perPartitionTxnIds", retrievePerPartitionTransactionIds());
            m_snapshots.offer(new Snapshot(m_path, nonce, now));
            long handle = m_nextCallbackHandle++;
            m_procedureCallbacks.put(handle, new ProcedureCallback() {

                @Override
                public void clientCallback(final ClientResponse clientResponse)
                        throws Exception
                {
                    m_lastInitiationTs = null;
                    processClientResponsePrivate(clientResponse);
                }

            });
            SNAP_LOG.info("Requesting auto snapshot to path " + m_path + " nonce " + nonce);
            initiateSnapshotSave(handle, new Object[]{jsObj.toString(4)}, false);
        } catch (JSONException e) {
            /*
             * Should never happen, so fail fast
             */
            VoltDB.crashLocalVoltDB("", false, e);
        }
    }

    /**
     * Invoke the @SnapshotScan system procedure to discover
     * snapshots on disk that are managed by this daemon
     * @return
     */
    private void initiateSnapshotScan() {
        m_lastSysprocInvocation = System.currentTimeMillis();
        Object params[] = new Object[1];
        params[0] = m_path;
        setState(State.SCANNING);
        long handle = m_nextCallbackHandle++;
        m_procedureCallbacks.put(handle, new ProcedureCallback() {

            @Override
            public void clientCallback(final ClientResponse clientResponse)
                    throws Exception {
                processClientResponsePrivate(clientResponse);
            }

        });
        SNAP_LOG.info("Initiating snapshot scan of " + m_path);
        m_initiator.initiateSnapshotDaemonWork("@SnapshotScan", handle, params);
    }

    /**
     * Process responses to sysproc invocations generated by this daemon
     * via processPeriodicWork
     * @param response
     * @return
     */
    public Future<Void> processClientResponse(final Callable<ClientResponseImpl> response) {
        return m_es.submit(new Callable<Void>() {
            @Override
            public Void call() throws Exception {
                try {
                    ClientResponseImpl resp = response.call();
                    long handle = resp.getClientHandle();
                    m_procedureCallbacks.remove(handle).clientCallback(resp);
                } catch (Exception e) {
                    SNAP_LOG.warn("Error when SnapshotDaemon invoked callback for a procedure invocation", e);
                    /*
                     * Don't think it is productive to propagate any exceptions here, Ideally
                     * they should be handled by the procedure callbacks
                     */
                }
                return null;
            }
        });
    }

    private void processClientResponsePrivate(ClientResponse response) {
        if (m_frequencyUnit == null) {
            throw new RuntimeException("SnapshotDaemon received a response when it has not been configured to run");
        }

        if (m_state == State.STARTUP) {
            throw new RuntimeException("SnapshotDaemon received a response in the startup state");
        } else if (m_state == State.SCANNING) {
            processScanResponse(response);
        } else if (m_state == State.DELETING){
            processDeleteResponse(response);
            return;
        } else if (m_state == State.SNAPSHOTTING){
            processSnapshotResponse(response);
            return;
        }
    }

    /**
     * Confirm and log that the snapshot was a success
     * @param response
     */
    private void processSnapshotResponse(ClientResponse response) {
        setState(State.WAITING);
        final long now = System.currentTimeMillis();
        m_nextSnapshotTime += m_frequencyInMillis;
        if (m_nextSnapshotTime < now) {
            m_nextSnapshotTime = now - 1;
        }

        if (response.getStatus() != ClientResponse.SUCCESS){
            logFailureResponse("Snapshot failed", response);
            return;
        }

        final VoltTable results[] = response.getResults();
        final VoltTable result = results[0];

        final String err = SnapshotUtil.didSnapshotRequestFailWithErr(results);
        if (err != null) {
            SNAP_LOG.warn("Snapshot failed with failure response: " +  err);
            m_snapshots.removeLast();
            return;
        }

        //assert(result.getColumnName(1).equals("TABLE"));
        boolean success = true;
        while (result.advanceRow()) {
            if (!result.getString("RESULT").equals("SUCCESS")) {
                success = false;
                SNAP_LOG.warn("Snapshot save feasibility test failed for host "
                        + result.getLong("HOST_ID") + " table " + result.getString("TABLE") +
                        " with error message " + result.getString("ERR_MSG"));
            }
        }
        if (!success) {
            m_snapshots.removeLast();
        }
    }

    /**
     * Process a response to a request to delete snapshots.
     * Always transitions to the waiting state even if the delete
     * fails. This ensures the system will continue to snapshot
     * until the disk is full in the event that there is an administration
     * error or a bug.
     * @param response
     */
    private void processDeleteResponse(ClientResponse response) {
        //Continue snapshotting even if a delete fails.
        setState(State.WAITING);
        if (response.getStatus() != ClientResponse.SUCCESS){
            logFailureResponse("Delete of snapshots failed", response);
            return;
        }

        final VoltTable results[] = response.getResults();
        final String err = SnapshotUtil.didSnapshotRequestFailWithErr(results);
        if (err != null) {
            SNAP_LOG.warn("Snapshot delete failed with failure response: " + err);
            return;
        }
    }

    /**
     * Process the response to a snapshot scan. Find the snapshots
     * that are managed by this daemon by path and nonce
     * and add it the list. Initiate a delete of any that should
     * not be retained
     * @param response
     * @return
     */
    private void processScanResponse(ClientResponse response) {
        setState(State.WAITING);
        if (response.getStatus() != ClientResponse.SUCCESS) {
            logFailureResponse("Initial snapshot scan failed", response);
            return;
        }

        final VoltTable results[] = response.getResults();
        if (results.length == 1) {
            final VoltTable result = results[0];
            boolean advanced = result.advanceRow();
            assert(advanced);
            assert(result.getColumnCount() == 1);
            assert(result.getColumnType(0) == VoltType.STRING);
            SNAP_LOG.warn("Initial snapshot scan failed with failure response: " + result.getString("ERR_MSG"));
            return;
        }
        assert(results.length == 3);

        final VoltTable snapshots = results[0];
        assert(snapshots.getColumnCount() == 9);

        final File myPath = new File(m_path);
        while (snapshots.advanceRow()) {
            final String path = snapshots.getString("PATH");
            final File pathFile = new File(path);
            if (pathFile.equals(myPath)) {
                final String nonce = snapshots.getString("NONCE");
                if (nonce.startsWith(m_prefixAndSeparator)) {
                    final Long txnId = snapshots.getLong("TXNID");
                    m_snapshots.add(new Snapshot(path, nonce, txnId));
                }
            }
        }

        java.util.Collections.sort(m_snapshots);

        deleteExtraSnapshots();
    }

    /**
     * Check if there are extra snapshots and initiate deletion
     * @return
     */
    private void deleteExtraSnapshots() {
        if (m_snapshots.size() <= m_retain) {
            setState(State.WAITING);
        } else {
            m_lastSysprocInvocation = System.currentTimeMillis();
            setState(State.DELETING);
            final int numberToDelete = m_snapshots.size() - m_retain;
            String pathsToDelete[] = new String[numberToDelete];
            String noncesToDelete[] = new String[numberToDelete];
            for (int ii = 0; ii < numberToDelete; ii++) {
                final Snapshot s = m_snapshots.poll();
                pathsToDelete[ii] = s.path;
                noncesToDelete[ii] = s.nonce;
                SNAP_LOG.info("Snapshot daemon deleting " + s.nonce);
            }
            Object params[] =
                new Object[] {
                    pathsToDelete,
                    noncesToDelete,
                    };
            long handle = m_nextCallbackHandle++;
            m_procedureCallbacks.put(handle, new ProcedureCallback() {

                @Override
                public void clientCallback(final ClientResponse clientResponse)
                        throws Exception {
                    processClientResponsePrivate(clientResponse);
                }

            });
            m_initiator.initiateSnapshotDaemonWork("@SnapshotDelete", handle, params);
        }
    }

    private void logFailureResponse(String message, ClientResponse response) {
        SNAP_LOG.warn(message + "\n" + response.getStatusString());
        if (response.getStatusString() != null) {
            SNAP_LOG.warn(response.getStatusString());
        }
    }

    State getState() {
        return m_state;
    }

    void setState(State state) {
        m_state = state;
    }

    public void shutdown() throws InterruptedException {
        if (m_truncationSnapshotScanTask != null) {
            m_truncationSnapshotScanTask.cancel(false);
        }

        m_es.shutdown();
        m_es.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
    }

    /*
     * If we are the leader, go ahead an create the procedure invocation to do the work.
     * We aren't going to journal this in ZK. if the leader dies there will be no
     * one to try and complete the work. C'est la vie.
     */
    public void requestUserSnapshot(final StoredProcedureInvocation invocation, final Connection c) {
        m_es.submit(new Runnable() {
            @Override
            public void run() {
                try {
                    submitUserSnapshotRequest(invocation, c);
                } catch (Exception e) {
                    VoltDB.crashLocalVoltDB("Exception submitting user snapshot request", true, e);
                }
            }
        });
    }

    public static final class ForwardClientException extends Exception {
        /**
         *
         */
        private static final long serialVersionUID = 1L;
        private final VoltTable m_table;
        public ForwardClientException(String msg, VoltTable table) {
            super(msg);
            m_table = table;
        }

    }

    private void submitUserSnapshotRequest(final StoredProcedureInvocation invocation, final Connection c) {
        Object params[] = invocation.getParams().toArray();

        try {
            /*
             * Dang it, have to parse the params here to validate
             */
            SnapshotInitiationInfo snapInfo = new SnapshotInitiationInfo(params);

            createAndWatchRequestNode(invocation.clientHandle, c, snapInfo,
                    false);
        } catch (Exception e) {
            VoltTable tables[] = new VoltTable[0];
            byte status = ClientResponseImpl.GRACEFUL_FAILURE;
            if (e instanceof ForwardClientException && ((ForwardClientException)e).m_table != null) {
                tables = new VoltTable[] { ((ForwardClientException)e).m_table };
                status = ClientResponseImpl.SUCCESS;
            }
            final ClientResponseImpl errorResponse =
                    new ClientResponseImpl(status,
                                           tables,
                                           Throwables.getStackTraceAsString(e),
                                           invocation.clientHandle);
            ByteBuffer buf = ByteBuffer.allocate(errorResponse.getSerializedSize() + 4);
            buf.putInt(buf.capacity() - 4);
            errorResponse.flattenToBuffer(buf).flip();
            c.writeStream().enqueue(buf);
            return;
        }
    }

    /**
     * Try to create the ZK request node and watch it if created successfully.
     */
    public void createAndWatchRequestNode(final long clientHandle,
                                          final Connection c,
                                          SnapshotInitiationInfo snapInfo,
                                          boolean notifyChanges) throws ForwardClientException {
        boolean requestExists = false;
        final String requestId = createRequestNode(snapInfo);
        if (requestId == null) {
            requestExists = true;
        } else {
            if (!snapInfo.isTruncationRequest()) {
                try {
                    registerUserSnapshotResponseWatch(requestId, clientHandle, c, notifyChanges);
                } catch (Exception e) {
                    VoltDB.crashLocalVoltDB("Failed to register ZK watch on snapshot response", true, e);
                }
            }
            else {
                // need to construct a success response of some sort here to indicate the truncation attempt
                // was successfully attempted
                VoltTable result = SnapshotUtil.constructNodeResultsTable();
                result.addRow(-1,
                        CoreUtils.getHostnameOrAddress(),
                        "",
                        "SUCCESS",
                        "SNAPSHOT REQUEST QUEUED");
                final ClientResponseImpl resp =
                    new ClientResponseImpl(ClientResponseImpl.SUCCESS,
                            new VoltTable[] {result},
                            "User-requested truncation snapshot successfully queued for execution.",
                            clientHandle);
                ByteBuffer buf = ByteBuffer.allocate(resp.getSerializedSize() + 4);
                buf.putInt(buf.capacity() - 4);
                resp.flattenToBuffer(buf).flip();
                c.writeStream().enqueue(buf);
            }
        }

        if (requestExists) {
            VoltTable result = SnapshotUtil.constructNodeResultsTable();
            result.addRow(-1,
                    CoreUtils.getHostnameOrAddress(),
                    "",
                    "FAILURE",
                    "SNAPSHOT IN PROGRESS");
            throw new ForwardClientException("A request to perform a user snapshot already exists", result);
        }
    }

    /**
     * Try to create the ZK node to request the snapshot.
     *
     * @param snapInfo SnapshotInitiationInfo object with the requested snapshot initiation settings
     * @return The request ID if succeeded, otherwise null.
     */
    private String createRequestNode(SnapshotInitiationInfo snapInfo)
    {
        String requestId = null;

        try {
            requestId = java.util.UUID.randomUUID().toString();
            if (!snapInfo.isTruncationRequest()) {
                final JSONObject jsObj = snapInfo.getJSONObjectForZK();
                jsObj.put("requestId", requestId);
                String zkString = jsObj.toString(4);
                byte zkBytes[] = zkString.getBytes("UTF-8");

                m_zk.create(VoltZK.user_snapshot_request, zkBytes, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
            }
            else {
                m_zk.create(VoltZK.request_truncation_snapshot_node, null,
                        Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT_SEQUENTIAL);
            }
        } catch (KeeperException.NodeExistsException e) {
            return null;
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Exception while attempting to create user snapshot request in ZK", true, e);
        }

        return requestId;
    }

    private void registerUserSnapshotResponseWatch(
            final String requestId,
            final long clientHandle,
            final Connection c,
            final boolean notifyChanges) throws Exception {
        final String responseNode = VoltZK.user_snapshot_response + requestId;
        Stat exists = m_zk.exists(responseNode, new Watcher() {
            @Override
            public void process(final WatchedEvent event) {
                if (event.getState() == KeeperState.Disconnected) return;
                switch (event.getType()) {
                case NodeCreated:
                    m_es.submit(new Runnable() {
                        @Override
                        public void run() {
                            try {
                                processUserSnapshotRequestResponse(
                                            event,
                                            clientHandle,
                                            c,
                                            notifyChanges);
                            } catch (Exception e) {
                                VoltDB.crashLocalVoltDB(
                                        "Error retrieving user snapshot request response from ZK",
                                        true,
                                        e);
                            }
                        }
                    });
                    break;
                default:
                }
            }
        });

        if (exists != null) {
            processUserSnapshotRequestResponse(
                    new WatchedEvent(
                        EventType.NodeCreated,
                        KeeperState.SyncConnected,
                        responseNode),
                        clientHandle,
                        c,
                        notifyChanges);
        }
    }

    void processUserSnapshotRequestResponse(
            final WatchedEvent event,
            final long clientHandle,
            final Connection c,
            final boolean notifyChanges) throws Exception {
        byte responseBytes[] = m_zk.getData(event.getPath(), false, null);
        try {
            m_zk.delete(event.getPath(), -1, null, null);
        } catch (Exception e) {
            SNAP_LOG.error("Error cleaning up user snapshot request response in ZK", e);
        }
        ByteBuffer buf = ByteBuffer.wrap(responseBytes);
        ClientResponseImpl response = new ClientResponseImpl();
        response.initFromBuffer(buf);
        response.setClientHandle(clientHandle);

        // Not sure if we need to preserve the original byte buffer here, playing it safe
        ByteBuffer buf2 = ByteBuffer.allocate(response.getSerializedSize() + 4);
        buf2.putInt(buf2.capacity() - 4);
        response.flattenToBuffer(buf2).flip();
        c.writeStream().enqueue(buf2);

        /*
         * If the caller wants to be notified of final results for the snapshot
         * request, set up a watcher only if the snapshot is queued.
         */
        if (notifyChanges && (response.getStatus() == ClientResponse.SUCCESS) &&
            SnapshotUtil.isSnapshotQueued(response.getResults())) {
            Watcher watcher = new Watcher() {
                @Override
                public void process(final WatchedEvent event) {
                    if (event.getState() == KeeperState.Disconnected) return;
                    switch (event.getType()) {
                    case NodeCreated:
                        m_es.submit(new Runnable() {
                            @Override
                            public void run() {
                                try {
                                    processUserSnapshotRequestResponse(
                                                event,
                                                clientHandle,
                                                c,
                                                false);
                                } catch (Exception e) {
                                    VoltDB.crashLocalVoltDB(
                                                "Error retrieving user snapshot request response from ZK",
                                                true,
                                                e);
                                }
                            }
                        });
                        break;
                    default:
                    }
                }
            };

            // Set the watcher
            if (m_zk.exists(event.getPath(), watcher) != null) {
                processUserSnapshotRequestResponse(event, clientHandle, c, false);
            }
        }
    }

    @Override
    public CountDownLatch snapshotCompleted(final SnapshotCompletionEvent event) {
        if (!event.truncationSnapshot || !event.didSucceed) {
            return new CountDownLatch(0);
        }
        final CountDownLatch latch = new CountDownLatch(1);
        m_es.execute(new Runnable() {
            @Override
            public void run() {
                try {
                    TruncationSnapshotAttempt snapshotAttempt = m_truncationSnapshotAttempts.get(event.multipartTxnId);
                    if (snapshotAttempt == null) {
                        snapshotAttempt = new TruncationSnapshotAttempt();
                        snapshotAttempt.path = event.path;
                        snapshotAttempt.nonce = event.nonce;
                        m_truncationSnapshotAttempts.put(event.multipartTxnId, snapshotAttempt);
                    }
                    snapshotAttempt.finished = true;
                    groomTruncationSnapshots();
                } finally {
                    latch.countDown();
                }
            }
        });
        return latch;
    }

}
TOP

Related Classes of org.voltdb.SnapshotDaemon$Snapshot

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
tics.com/analytics.js','ga'); ga('create', 'UA-20639858-1', 'auto'); ga('send', 'pageview');