Source Code of org.voltdb.iv2.MpScheduler

/* This file is part of VoltDB.
* Copyright (C) 2008-2014 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
*/

package org.voltdb.iv2;

import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.collect.Sets;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.Mailbox;
import org.voltcore.messaging.TransactionInfoBaseMessage;
import org.voltcore.messaging.VoltMessage;
import org.voltdb.CatalogContext;
import org.voltdb.CatalogSpecificPlanner;
import org.voltdb.CommandLog;
import org.voltdb.SystemProcedureCatalog;
import org.voltdb.SystemProcedureCatalog.Config;
import org.voltdb.VoltDB;
import org.voltdb.VoltTable;
import org.voltdb.dtxn.TransactionState;
import org.voltdb.messaging.CompleteTransactionMessage;
import org.voltdb.messaging.FragmentResponseMessage;
import org.voltdb.messaging.FragmentTaskMessage;
import org.voltdb.messaging.InitiateResponseMessage;
import org.voltdb.messaging.Iv2EndOfLogMessage;
import org.voltdb.messaging.Iv2InitiateTaskMessage;
import org.voltdb.sysprocs.BalancePartitionsRequest;
import org.voltdb.utils.MiscUtils;

public class MpScheduler extends Scheduler
{
    static VoltLogger tmLog = new VoltLogger("TM");

    // null when running the community edition; callers fall back to MpProcedureTask
    private static final Constructor<?> NpProcedureTaskConstructor = loadNpProcedureTaskClass();

    private final Map<Long, TransactionState> m_outstandingTxns =
        new HashMap<Long, TransactionState>();
    private final Map<Long, DuplicateCounter> m_duplicateCounters =
        new HashMap<Long, DuplicateCounter>();

    private final List<Long> m_iv2Masters;
    private final Map<Integer, Long> m_partitionMasters;
    private final List<Long> m_buddyHSIds;
    private int m_nextBuddy = 0;
    // Generator of pre-IV2-style, timestamp-based unique IDs
    private final UniqueIdGenerator m_uniqueIdGenerator;
    final private MpTransactionTaskQueue m_pendingTasks;

    // The point at or below which repair log entries are no longer needed.
    long m_repairLogTruncationHandle = Long.MIN_VALUE;
    // We need to lag the current MP execution point by at least two committed TXN ids
    // since that's the first point we can be sure is safely agreed on by all nodes.
    // Let the one we can't be sure about linger here.  See ENG-4211 for more.
    long m_repairLogAwaitingCommit = Long.MIN_VALUE;
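    // Worked example (illustrative): if committed MP txns T1 < T2 < T3 are
    // acknowledged in order, then after T3's response m_repairLogAwaitingCommit
    // is T3 and m_repairLogTruncationHandle is T2, so only entries at or below
    // T2 may be pruned; T3 lingers until the next commit lands.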

    MpScheduler(int partitionId, List<Long> buddyHSIds, SiteTaskerQueue taskQueue)
    {
        super(partitionId, taskQueue);
        m_pendingTasks = new MpTransactionTaskQueue(m_tasks, getCurrentTxnId());
        m_buddyHSIds = buddyHSIds;
        m_iv2Masters = new ArrayList<Long>();
        m_partitionMasters = Maps.newHashMap();
        m_uniqueIdGenerator = new UniqueIdGenerator(partitionId, 0);
    }

    void setMpRoSitePool(MpRoSitePool sitePool)
    {
        m_pendingTasks.setMpRoSitePool(sitePool);
    }

    void updateCatalog(String diffCmds, CatalogContext context, CatalogSpecificPlanner csp)
    {
        m_pendingTasks.updateCatalog(diffCmds, context, csp);
    }

    @Override
    public void shutdown()
    {
        // Cancel any in-progress transaction by creating a fragment
        // response to roll back. This function must be called with
        // the deliver lock held to be correct. The null task should
        // never run; the site thread is expected to be told to stop.
        m_pendingTasks.shutdown();
        m_pendingTasks.repair(m_nullTask, m_iv2Masters, m_partitionMasters);
    }


    @Override
    public void updateReplicas(final List<Long> replicas, final Map<Integer, Long> partitionMasters)
    {
        // Handle startup and promotion semi-gracefully
        m_iv2Masters.clear();
        m_iv2Masters.addAll(replicas);
        m_partitionMasters.clear();
        m_partitionMasters.putAll(partitionMasters);
        if (!m_isLeader) {
            return;
        }

        // Stolen from SpScheduler.  Need to update the duplicate counters associated with any EveryPartitionTasks
        // Cleanup duplicate counters and collect DONE counters
        // in this list for further processing.
        List<Long> doneCounters = new LinkedList<Long>();
        for (Entry<Long, DuplicateCounter> entry : m_duplicateCounters.entrySet()) {
            DuplicateCounter counter = entry.getValue();
            int result = counter.updateReplicas(m_iv2Masters);
            if (result == DuplicateCounter.DONE) {
                doneCounters.add(entry.getKey());
            }
        }

        // Maintain the CI invariant that responses arrive in txnid order.
        Collections.sort(doneCounters);
        for (Long key : doneCounters) {
            DuplicateCounter counter = m_duplicateCounters.remove(key);
            VoltMessage resp = counter.getLastResponse();
            if (resp instanceof InitiateResponseMessage) {
                InitiateResponseMessage msg = (InitiateResponseMessage)resp;
                if (msg.shouldCommit()) {
                    m_repairLogTruncationHandle = m_repairLogAwaitingCommit;
                    m_repairLogAwaitingCommit = msg.getTxnId();
                }
                m_outstandingTxns.remove(msg.getTxnId());
                m_mailbox.send(counter.m_destinationId, resp);
            }
            else {
                hostLog.warn("TXN " + counter.getTxnId() + " lost all replicas and " +
                        "had no responses.  This should be impossible?");
            }
        }

        MpRepairTask repairTask = new MpRepairTask((InitiatorMailbox)m_mailbox, replicas);
        m_pendingTasks.repair(repairTask, replicas, partitionMasters);
    }

    /**
     * Sequence the message for replay if it's for DR.
     * @return true if the message can be delivered directly to the scheduler,
     * false if the message was a duplicate
     */
    @Override
    public boolean sequenceForReplay(VoltMessage message)
    {
        boolean canDeliver = true;
        long sequenceWithTxnId = Long.MIN_VALUE;

        boolean dr = ((message instanceof TransactionInfoBaseMessage &&
                ((TransactionInfoBaseMessage)message).isForDR()));

        if (dr) {
            sequenceWithTxnId = ((TransactionInfoBaseMessage)message).getOriginalTxnId();
            InitiateResponseMessage dupe = m_replaySequencer.dedupe(sequenceWithTxnId,
                    (TransactionInfoBaseMessage) message);
            if (dupe != null) {
                canDeliver = false;
                // Duplicate initiate task message, send response
                m_mailbox.send(dupe.getInitiatorHSId(), dupe);
            }
            else {
                m_replaySequencer.updateLastSeenTxnId(sequenceWithTxnId,
                        (TransactionInfoBaseMessage) message);
                canDeliver = true;
            }
        }
        return canDeliver;
    }
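    // Example flow (illustrative): the first DR message carrying originalTxnId T
    // misses in dedupe(), is recorded via updateLastSeenTxnId(), and is
    // delivered; a replayed duplicate with the same T returns the cached
    // InitiateResponseMessage from dedupe(), which is sent straight back to its
    // initiator instead of being executed a second time.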

    @Override
    public void deliver(VoltMessage message)
    {
        if (message instanceof Iv2InitiateTaskMessage) {
            handleIv2InitiateTaskMessage((Iv2InitiateTaskMessage)message);
        }
        else if (message instanceof InitiateResponseMessage) {
            handleInitiateResponseMessage((InitiateResponseMessage)message);
        }
        else if (message instanceof FragmentResponseMessage) {
            handleFragmentResponseMessage((FragmentResponseMessage)message);
        }
        else if (message instanceof Iv2EndOfLogMessage) {
            handleEOLMessage();
        }
        else {
            throw new RuntimeException("UNKNOWN MESSAGE TYPE, BOOM!");
        }
    }

    // MpScheduler expects to see initiations for multipartition procedures and
    // system procedures which are "every-partition", meaning that they run as
    // single-partition procedures at every partition, and the results are
    // aggregated/deduped here at the MPI.
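    // Illustrative routing: a system procedure whose Config reports
    // getEverysite() == true fans out below as one single-partition
    // Iv2InitiateTaskMessage per partition master (deduped on the way back),
    // while any other initiation becomes a single MpProcedureTask (or, for
    // @BalancePartitions, an NpProcedureTask when the pro class is available).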
    public void handleIv2InitiateTaskMessage(Iv2InitiateTaskMessage message)
    {
        final String procedureName = message.getStoredProcedureName();

        /*
         * If this is CL replay, use the txnid from the CL and use it to update the current txnid
         */
        long mpTxnId;
        // Timestamp is actually a pre-IV2-style, time-based transaction id
        long timestamp = Long.MIN_VALUE;

        // Update UID if it's for replay
        if (message.isForReplay()) {
            timestamp = message.getUniqueId();
            m_uniqueIdGenerator.updateMostRecentlyGeneratedUniqueId(timestamp);
        } else if (message.isForDR()) {
            timestamp = message.getStoredProcedureInvocation().getOriginalUniqueId();
            // @LoadMultipartitionTable does not have a valid uid
            if (UniqueIdGenerator.getPartitionIdFromUniqueId(timestamp) == m_partitionId) {
                m_uniqueIdGenerator.updateMostRecentlyGeneratedUniqueId(timestamp);
            }
        }

        if (message.isForReplay()) {
            mpTxnId = message.getTxnId();
            setMaxSeenTxnId(mpTxnId);
        } else {
            TxnEgo ego = advanceTxnEgo();
            mpTxnId = ego.getTxnId();
            timestamp = m_uniqueIdGenerator.getNextUniqueId();
        }
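
        // At this point mpTxnId is either the replayed txnId from the command
        // log or a freshly advanced TxnEgo id (monotonically increasing for
        // this initiator), and timestamp is the pre-IV2-style time-based
        // unique id paired with it.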

        // Don't have an SP HANDLE at the MPI, so fill in the unused value
        Iv2Trace.logIv2InitiateTaskMessage(message, m_mailbox.getHSId(), mpTxnId, Long.MIN_VALUE);

        // Handle every-site system procedures (at the MPI)
        final Config sysprocConfig = SystemProcedureCatalog.listing.get(procedureName);
        if (sysprocConfig != null &&  sysprocConfig.getEverysite()) {
            // Send an SP initiate task to all remote sites
            final Long localId = m_mailbox.getHSId();
            Iv2InitiateTaskMessage sp = new Iv2InitiateTaskMessage(
                    localId, // make the MPI the initiator.
                    message.getCoordinatorHSId(),
                    m_repairLogTruncationHandle,
                    mpTxnId,
                    timestamp,
                    message.isReadOnly(),
                    true, // isSinglePartition
                    message.getStoredProcedureInvocation(),
                    message.getClientInterfaceHandle(),
                    message.getConnectionId(),
                    message.isForReplay());
            DuplicateCounter counter = new DuplicateCounter(
                    message.getInitiatorHSId(),
                    mpTxnId,
                    m_iv2Masters, message.getStoredProcedureName());
            m_duplicateCounters.put(mpTxnId, counter);
            EveryPartitionTask eptask =
                new EveryPartitionTask(m_mailbox, m_pendingTasks, sp,
                        m_iv2Masters);
            m_pendingTasks.offer(eptask);
            return;
        }
        // Create a copy so we can overwrite the txnId, ensuring the
        // InitiateResponse will be correctly tracked.
        Iv2InitiateTaskMessage mp =
            new Iv2InitiateTaskMessage(
                    message.getInitiatorHSId(),
                    message.getCoordinatorHSId(),
                    m_repairLogTruncationHandle,
                    mpTxnId,
                    timestamp,
                    message.isReadOnly(),
                    message.isSinglePartition(),
                    message.getStoredProcedureInvocation(),
                    message.getClientInterfaceHandle(),
                    message.getConnectionId(),
                    message.isForReplay());
        // Multi-partition initiation (at the MPI)
        MpProcedureTask task = null;
        if (isNpTxn(message) && NpProcedureTaskConstructor != null) {
            Set<Integer> involvedPartitions = getBalancePartitions(message);
            if (involvedPartitions != null) {
                HashMap<Integer, Long> involvedPartitionMasters = Maps.newHashMap(m_partitionMasters);
                involvedPartitionMasters.keySet().retainAll(involvedPartitions);

                task = instantiateNpProcedureTask(m_mailbox, procedureName,
                        m_pendingTasks, mp, involvedPartitionMasters,
                        m_buddyHSIds.get(m_nextBuddy), false);
            }

            // If we cannot determine the involved partitions, fall through and run it as an MP txn
        }

        if (task == null) {
            task = new MpProcedureTask(m_mailbox, procedureName,
                    m_pendingTasks, mp, m_iv2Masters, m_partitionMasters,
                    m_buddyHSIds.get(m_nextBuddy), false);
        }

        // Round-robin to the next buddy site for the following transaction
        m_nextBuddy = (m_nextBuddy + 1) % m_buddyHSIds.size();
        m_outstandingTxns.put(task.m_txnState.txnId, task.m_txnState);
        m_pendingTasks.offer(task);
    }

    /**
     * Hacky way to only run @BalancePartitions as n-partition transactions for now.
     * @return true if it's an n-partition transaction
     */
    private boolean isNpTxn(Iv2InitiateTaskMessage msg)
    {
        return msg.getStoredProcedureName().startsWith("@") &&
                msg.getStoredProcedureName().equalsIgnoreCase("@BalancePartitions") &&
                (byte) msg.getParameters()[1] != 1; // clearIndex is MP, normal rebalance is NP
    }
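    // Illustrative (hypothetical) client call that this predicate matches,
    // assuming the standard Java client API:
    //   client.callProcedure("@BalancePartitions", requestJson, (byte) 0);
    // The second parameter is clearIndex: 1 forces a full MP txn, while any
    // other value selects the n-partition rebalance path.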

    /**
     * Extract the two involved partitions from the @BalancePartitions request.
     */
    private Set<Integer> getBalancePartitions(Iv2InitiateTaskMessage msg)
    {
        try {
            JSONObject jsObj = new JSONObject((String) msg.getParameters()[0]);
            BalancePartitionsRequest request = new BalancePartitionsRequest(jsObj);

            return Sets.newHashSet(request.partitionPairs.get(0).srcPartition,
                    request.partitionPairs.get(0).destPartition);
        } catch (JSONException e) {
            hostLog.warn("Unable to determine partitions for @BalancePartitions", e);
            return null;
        }
    }
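    // Sketch of the expected request payload (field names inferred from the
    // accessors used above; only the first pair is consulted here):
    //   {"partitionPairs": [{"srcPartition": 0, "destPartition": 3, ...}]}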

    @Override
    public void handleMessageRepair(List<Long> needsRepair, VoltMessage message)
    {
        if (message instanceof Iv2InitiateTaskMessage) {
            handleIv2InitiateTaskMessageRepair(needsRepair, (Iv2InitiateTaskMessage)message);
        }
        else {
            // MpInitiatorMailbox should throw RuntimeException for unhandled types before we could get here
            throw new RuntimeException("MpScheduler.handleMessageRepair() received unhandled message type." +
                    " This should be impossible");
        }
    }

    private void handleIv2InitiateTaskMessageRepair(List<Long> needsRepair, Iv2InitiateTaskMessage message)
    {
        // Just re-forward the Iv2InitiateTaskMessage for the txn being restarted.
        // This copy may be unnecessary.
        final String procedureName = message.getStoredProcedureName();
        Iv2InitiateTaskMessage mp =
            new Iv2InitiateTaskMessage(
                    message.getInitiatorHSId(),
                    message.getCoordinatorHSId(),
                    message.getTruncationHandle(),
                    message.getTxnId(),
                    message.getUniqueId(),
                    message.isReadOnly(),
                    message.isSinglePartition(),
                    message.getStoredProcedureInvocation(),
                    message.getClientInterfaceHandle(),
                    message.getConnectionId(),
                    message.isForReplay());
        m_uniqueIdGenerator.updateMostRecentlyGeneratedUniqueId(message.getUniqueId());
        // Multi-partition initiation (at the MPI)
        MpProcedureTask task = null;
        if (isNpTxn(message) && NpProcedureTaskConstructor != null) {
            Set<Integer> involvedPartitions = getBalancePartitions(message);
            if (involvedPartitions != null) {
                HashMap<Integer, Long> involvedPartitionMasters = Maps.newHashMap(m_partitionMasters);
                involvedPartitionMasters.keySet().retainAll(involvedPartitions);

                task = instantiateNpProcedureTask(m_mailbox, procedureName,
                        m_pendingTasks, mp, involvedPartitionMasters,
                        m_buddyHSIds.get(m_nextBuddy), true);
            }

            // If we cannot determine the involved partitions, fall through and run it as an MP txn
        }

        if (task == null) {
            task = new MpProcedureTask(m_mailbox, procedureName,
                    m_pendingTasks, mp, m_iv2Masters, m_partitionMasters,
                    m_buddyHSIds.get(m_nextBuddy), true);
        }

        // Round-robin to the next buddy site for the following transaction
        m_nextBuddy = (m_nextBuddy + 1) % m_buddyHSIds.size();
        m_outstandingTxns.put(task.m_txnState.txnId, task.m_txnState);
        m_pendingTasks.offer(task);
    }

    // The MpScheduler will see InitiateResponseMessages from the partition
    // masters when performing an every-partition system procedure.  A
    // consequence of this deduping is that the MpScheduler must also forward
    // the final InitiateResponseMessage for a normal multipartition procedure
    // back to the client interface, since it has to see all of these messages
    // and control their transmission.
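    // Worked example (illustrative): an every-partition sysproc on a cluster
    // with three partition masters yields three InitiateResponseMessages here;
    // the DuplicateCounter returns DONE on the third offer, and only then is
    // the single surviving response forwarded to counter.m_destinationId.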
    public void handleInitiateResponseMessage(InitiateResponseMessage message)
    {
        DuplicateCounter counter = m_duplicateCounters.get(message.getTxnId());
        if (counter != null) {
            int result = counter.offer(message);
            if (result == DuplicateCounter.DONE) {
                m_duplicateCounters.remove(message.getTxnId());
                // Only advance the truncation point on committed transactions.  See ENG-4211
                if (message.shouldCommit()) {
                    m_repairLogTruncationHandle = m_repairLogAwaitingCommit;
                    m_repairLogAwaitingCommit = message.getTxnId();
                }
                m_outstandingTxns.remove(message.getTxnId());

                m_mailbox.send(counter.m_destinationId, message);
            }
            else if (result == DuplicateCounter.MISMATCH) {
                VoltDB.crashLocalVoltDB("HASH MISMATCH running every-site system procedure.", true, null);
            }
            // doing duplicate suppression: all done.
        }
        else {
            // Only advance the truncation point on committed transactions.
            if (message.shouldCommit()) {
                m_repairLogTruncationHandle = m_repairLogAwaitingCommit;
                m_repairLogAwaitingCommit = message.getTxnId();
            }
            m_outstandingTxns.remove(message.getTxnId());
            // the initiatorHSId is the ClientInterface mailbox. Yeah. I know.
            m_mailbox.send(message.getInitiatorHSId(), message);
            // We actually completed this MP transaction.  Create a fake CompleteTransactionMessage
            // to send to our local repair log so that the fate of this transaction is never forgotten
            // even if all the masters somehow die before forwarding Complete on to their replicas.
            CompleteTransactionMessage ctm = new CompleteTransactionMessage(m_mailbox.getHSId(),
                    message.m_sourceHSId, message.getTxnId(), message.isReadOnly(), 0,
                    !message.shouldCommit(), false, false, false);
            ctm.setTruncationHandle(m_repairLogTruncationHandle);
            // dump it in the repair log
            // hacky castage
            ((MpInitiatorMailbox)m_mailbox).deliverToRepairLog(ctm);
        }
    }

    public void handleFragmentTaskMessage(FragmentTaskMessage message,
                                          Map<Integer, List<VoltTable>> inputDeps)
    {
        throw new RuntimeException("MpScheduler should never see a FragmentTaskMessage");
    }

    // MpScheduler will receive FragmentResponses from the partition masters, and needs
    // to offer them to the corresponding TransactionState so that the TransactionTask in
    // the runloop which is awaiting these responses can do dependency tracking and eventually
    // unblock.
    public void handleFragmentResponseMessage(FragmentResponseMessage message)
    {
        TransactionState txn = m_outstandingTxns.get(message.getTxnId());
        // We could already have received the CompleteTransactionMessage from
        // the local site and the transaction is dead, despite FragmentResponses
        // in flight from remote sites.  Drop those on the floor.
        // IZZY: After implementing BorrowTasks, I'm not sure that the above sequence
        // can actually happen any longer, but leaving this and logging it for now.
        // RTB: Didn't we decide early rollback can do this legitimately?
        if (txn != null) {
            ((MpTransactionState)txn).offerReceivedFragmentResponse(message);
        }
        else {
            hostLog.debug("MpScheduler received a FragmentResponseMessage for a null TXN ID: " + message);
        }
    }

    public void handleCompleteTransactionMessage(CompleteTransactionMessage message)
    {
        throw new RuntimeException("MpScheduler should never see a CompleteTransactionMessage");
    }

    /**
     * Inject a task into the transaction task queue to flush it. When it
     * executes, it will send out MPI end of log messages to all partition
     * initiators.
     */
    public void handleEOLMessage()
    {
        Iv2EndOfLogMessage msg = new Iv2EndOfLogMessage(m_partitionId);
        MPIEndOfLogTransactionState txnState = new MPIEndOfLogTransactionState(msg);
        MPIEndOfLogTask task = new MPIEndOfLogTask(m_mailbox, m_pendingTasks,
                                                   txnState, m_iv2Masters);
        m_pendingTasks.offer(task);
    }

    @Override
    public void setCommandLog(CommandLog cl) {
        // The MPI currently doesn't do command logging, so it doesn't keep a reference to one.
    }

    @Override
    public void enableWritingIv2FaultLog() {
        // This is currently a no-op for the MPI
    }

    /**
     * Load the pro class for n-partition transactions.
     * @return null if running in community or failed to load the class
     */
    private static Constructor<?> loadNpProcedureTaskClass()
    {
        Class<?> klass = MiscUtils.loadProClass("org.voltdb.iv2.NpProcedureTask", "N-Partition", !MiscUtils.isPro());
        if (klass != null) {
            try {
                return klass.getConstructor(Mailbox.class, String.class, TransactionTaskQueue.class,
                        Iv2InitiateTaskMessage.class, Map.class, long.class, boolean.class);
            } catch (NoSuchMethodException e) {
                hostLog.error("Unabled to get the constructor for pro class NpProcedureTask", e);
                return null;
            }
        } else {
            return null;
        }
    }
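
    // The pro-only NpProcedureTask is therefore expected to declare a
    // constructor shaped like the following (parameter names are illustrative;
    // only the types are enforced by the reflective lookup above):
    //   NpProcedureTask(Mailbox mailbox, String procName,
    //                   TransactionTaskQueue queue, Iv2InitiateTaskMessage msg,
    //                   Map<Integer, Long> involvedPartitionMasters,
    //                   long buddyHSId, boolean isRestart)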

    private static MpProcedureTask instantiateNpProcedureTask(Object... params)
    {
        if (NpProcedureTaskConstructor != null) {
            try {
                return (MpProcedureTask) NpProcedureTaskConstructor.newInstance(params);
            } catch (Exception e) {
                tmLog.error("Unable to instantiate NpProcedureTask", e);
            }
        }
        return null;
    }
}