Source Code of org.voltdb.compiler.ClusterConfig$Node

/* This file is part of VoltDB.
* Copyright (C) 2008-2014 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb.compiler;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.json_voltpatches.JSONArray;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.json_voltpatches.JSONStringer;
import org.voltcore.logging.VoltLogger;
import org.voltdb.VoltDB;

import com.google_voltpatches.common.collect.Multimap;

public class ClusterConfig
{
    private static final VoltLogger hostLog = new VoltLogger("HOST");

    public static List<Integer> partitionsForHost(JSONObject topo, int hostId) throws JSONException
    {
        List<Integer> partitions = new ArrayList<Integer>();

        JSONArray parts = topo.getJSONArray("partitions");

        for (int p = 0; p < parts.length(); p++) {
            // have an object in the partitions array
            JSONObject aPartition = parts.getJSONObject(p);
            int pid = aPartition.getInt("partition_id");
            JSONArray replicas = aPartition.getJSONArray("replicas");
            for (int h = 0; h < replicas.length(); h++)
            {
                int replica = replicas.getInt(h);
                if (replica == hostId)
                {
                    partitions.add(pid);
                }
            }
        }

        return partitions;
    }
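
    /*
     * Usage sketch (hypothetical values, assuming java.util.Arrays is imported):
     *
     *   ClusterConfig config = new ClusterConfig(2, 4, 1);  // 2 hosts, 4 sites/host, k=1
     *   JSONObject topo = config.getTopology(Arrays.asList(0, 1));
     *   List<Integer> parts = ClusterConfig.partitionsForHost(topo, 0);
     *   // parts holds every partition id whose "replicas" array includes host 0
     */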

    /**
     * Add new hosts to the current topology.
     *
     * This method modifies the topology in place.
     *
     * @param newHosts The number of new hosts to add; must equal kfactor + 1.
     * @param topo The existing topology, whose host count will be updated in place.
     */
    public static void addHosts(int newHosts, JSONObject topo) throws JSONException
    {
        ClusterConfig config = new ClusterConfig(topo);
        int kfactor = config.getReplicationFactor();

        if (newHosts != kfactor + 1) {
            VoltDB.crashLocalVoltDB("Only adding " + (kfactor + 1) + " nodes at a time is " +
                    "supported, currently trying to add " + newHosts, false, null);
        }

        // increase host count
        topo.put("hostcount", config.getHostCount() + newHosts);
    }
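
    /*
     * Example (hypothetical): with k=1, exactly k+1 = 2 hosts must be added per
     * elastic join, so on a 2-host, k=1 topology:
     *
     *   ClusterConfig.addHosts(2, topo);  // "hostcount" becomes 4
     *   ClusterConfig.addHosts(1, topo);  // crashes: only kfactor + 1 = 2 at a time
     */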

    /**
     * Add new partitions to the topology.
     * @param topo          The topology that will be added to.
     * @param partToHost    A map of new partitions to their corresponding replica host IDs.
     * @throws JSONException
     */
    public static void addPartitions(JSONObject topo, Multimap<Integer, Integer> partToHost)
        throws JSONException
    {
        JSONArray partitions = topo.getJSONArray("partitions");
        for (Map.Entry<Integer, Collection<Integer>> e : partToHost.asMap().entrySet()) {
            int partition = e.getKey();
            Collection<Integer> hosts = e.getValue();

            JSONObject partObj = new JSONObject();
            partObj.put("partition_id", partition);
            partObj.put("replicas", hosts);

            partitions.put(partObj);
        }
    }
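
    /*
     * Example (hypothetical partition and host ids, assuming Guava's
     * HashMultimap from com.google_voltpatches.common.collect):
     *
     *   Multimap<Integer, Integer> partToHost = HashMultimap.create();
     *   partToHost.put(12, 0);  // new partition 12 replicated on hosts 0 and 1
     *   partToHost.put(12, 1);
     *   ClusterConfig.addPartitions(topo, partToHost);
     */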

    public ClusterConfig(int hostCount, int sitesPerHost, int replicationFactor)
    {
        m_hostCount = hostCount;
        m_sitesPerHost = sitesPerHost;
        m_replicationFactor = replicationFactor;
        m_errorMsg = "Config is unvalidated";
    }

    // Construct a ClusterConfig object from the JSON topology.  The computations
    // for this object are currently deterministic given the three values below, so
    // this all magically works.  If you change that fact, good luck Chuck.
    public ClusterConfig(JSONObject topo) throws JSONException
    {
        m_hostCount = topo.getInt("hostcount");
        m_sitesPerHost = topo.getInt("sites_per_host");
        m_replicationFactor = topo.getInt("kfactor");
        m_errorMsg = "Config is unvalidated";
    }

    public int getHostCount()
    {
        return m_hostCount;
    }

    public int getSitesPerHost()
    {
        return m_sitesPerHost;
    }

    public int getReplicationFactor()
    {
        return m_replicationFactor;
    }

    public int getPartitionCount()
    {
        return (m_hostCount * m_sitesPerHost) / (m_replicationFactor + 1);
    }
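
    // Worked example: 4 hosts * 6 sites/host = 24 total sites; with k=1 each
    // partition needs k+1 = 2 copies, so getPartitionCount() = 24 / 2 = 12.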

    public String getErrorMsg()
    {
        return m_errorMsg;
    }

    public boolean validate()
    {
        if (m_hostCount <= 0)
        {
            m_errorMsg = "The number of hosts must be > 0.";
            return false;
        }
        if (m_sitesPerHost <= 0)
        {
            m_errorMsg = "The number of sites per host must be > 0.";
            return false;
        }
        if (m_hostCount <= m_replicationFactor)
        {
            m_errorMsg = String.format("%d servers required for K-safety=%d",
                                       m_replicationFactor + 1, m_replicationFactor);
            return false;
        }
        if (getPartitionCount() == 0)
        {
            m_errorMsg = String.format("Insufficient execution site count to achieve K-safety of %d",
                                       m_replicationFactor);
            return false;
        }
        m_errorMsg = "Cluster config contains no detected errors";
        return true;
    }

    public boolean validate(int origStartCount)
    {
        boolean isValid = validate();
        if (isValid && origStartCount < m_hostCount && origStartCount > 0)
        {
            if ((m_hostCount - origStartCount) > m_replicationFactor + 1)
            {
                m_errorMsg = String.format("You can only add %d servers at a time for k=&d",
                        m_replicationFactor + 1, m_replicationFactor);
                return false;
            }
            else if ((m_hostCount - origStartCount) % (m_replicationFactor + 1) != 0)
            {
                m_errorMsg = String.format("Must add %d servers at a time for k=%d",
                        m_replicationFactor + 1, m_replicationFactor);
                return false;
            }
        }
        return isValid;
    }
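
    /*
     * Example (hypothetical): elastic-expansion checks with k=1.
     *
     *   new ClusterConfig(6, 4, 1).validate(4);  // true: added exactly k+1 = 2 hosts
     *   new ClusterConfig(5, 4, 1).validate(4);  // false: must add 2 servers at a time
     *   new ClusterConfig(7, 4, 1).validate(4);  // false: 3 > k+1 servers added at once
     */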

    private static class Partition {
        private Node m_master;
        private final Set<Node> m_replicas = new HashSet<Node>();
        private final Integer m_partitionId;

        private int m_neededReplicas;

        public Partition(Integer partitionId, int neededReplicas) {
            m_partitionId = partitionId;
            m_neededReplicas = neededReplicas;
        }

        boolean needsReplicas() {
            return m_neededReplicas > 0;
        }

        @Override
        public int hashCode() {
            return m_partitionId.hashCode();
        }

        public void decrementNeededReplicas() {
            if (m_neededReplicas == 0) {
                throw new RuntimeException("ClusterConfig error: Attempted to replicate a partition too many times");
            }
            m_neededReplicas--;
        }

        public boolean canUseAsReplica(Node n) {
            return needsReplicas() && m_master != n && !m_replicas.contains(n);
        }

        @Override
        public boolean equals(Object o) {
            if (o instanceof Partition) {
                Partition p = (Partition)o;
                return m_partitionId.equals(p.m_partitionId);
            }
            return false;
        }

        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("Partition " + m_partitionId + " needing replicas " + m_neededReplicas);
            sb.append(" with master " + m_master.m_hostId + " and replicas ");
            for (Node n : m_replicas) {
                sb.append(n.m_hostId).append(", ");
            }
            return sb.toString();
        }
    }

    private static class Node {
        Set<Partition> m_masterPartitions = new HashSet<Partition>();
        Set<Partition> m_replicaPartitions = new HashSet<Partition>();
        Map<Node, Set<Partition>> m_replicationConnections = new HashMap<Node, Set<Partition>>();
        Integer m_hostId;

        public Node(Integer hostId) {
            m_hostId = hostId;
        }

        int partitionCount() {
            return m_masterPartitions.size() + m_replicaPartitions.size();
        }

        @Override
        public int hashCode() {
            return m_hostId.hashCode();
        }

        @Override
        public boolean equals(Object o) {
            if (o instanceof Node) {
                Node n = (Node)o;
                return m_hostId.equals(n.m_hostId);
            }
            return false;
        }

        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("Host " + m_hostId + " master of ");
            for (Partition p : m_masterPartitions) {
                sb.append(p.m_partitionId).append(", ");
            }
            sb.append(" replica of ");
            for (Partition p : m_replicaPartitions) {
                sb.append(p.m_partitionId).append(", ");
            }
            sb.append(" connected to ");
            for (Map.Entry<Node, Set<Partition>> entry : m_replicationConnections.entrySet()) {
                sb.append(" host " + entry.getKey().m_hostId + " for partitions ");
                for (Partition p : entry.getValue()) {
                    sb.append(p.m_partitionId).append(", ");
                }
                sb.append(";");
            }
            return sb.toString();
        }
    }

    /*
     * Are there any partitions that are not yet fully replicated?
     */
    private static boolean needReplication(List<Partition> partitions) {
        for (Partition p : partitions) {
            if (p.needsReplicas()) {
                return true;
            }
        }
        return false;
    }

    /*
     * Find a node that can take more replicas, preferring the node with the
     * fewest replication connections; when the number of replication
     * connections is equal, prefer the node replicating the fewest partitions.
     */
    Node nextNotFullNode(List<Node> nodes, int sitesPerNode) {
        ArrayList<Node> notFullList = new ArrayList<Node>();
        for (Node n : nodes) {
            if (n.partitionCount() < sitesPerNode) {
                notFullList.add(n);
            }
        }

        Node leastConnectedNode = null;
        for (Node n : notFullList) {
            if (leastConnectedNode == null) {
                leastConnectedNode = n;
                continue;
            }

            /*
             * Pick the one with the fewest connections, and for those that have the same number
             * of connections, pick the one that is replicating the fewest partitions
             */
            if (n.m_replicationConnections.size() < leastConnectedNode.m_replicationConnections.size()) {
                leastConnectedNode = n;
            }
            else if (n.m_replicationConnections.size() == leastConnectedNode.m_replicationConnections.size()) {
                int sumA = 0;
                for (Set<Partition> replicas : n.m_replicationConnections.values()) {
                    sumA += replicas.size();
                }
                int sumB = 0;
                for (Set<Partition> replicas : leastConnectedNode.m_replicationConnections.values()) {
                    sumB += replicas.size();
                }
                if (sumA < sumB) {
                    leastConnectedNode = n;
                }
            }
        }
        return leastConnectedNode;
    }
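
    /*
     * Example (hypothetical): with sitesPerNode = 4, a node already holding 4
     * partitions is excluded. Among the remaining nodes, one with 1
     * replication connection beats one with 2; if both have 2, the node whose
     * connections carry fewer total replicated partitions wins.
     */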


    /*
     * Original placement strategy that doesn't get very good performance
     */
    JSONObject fallbackPlacementStrategy(
            List<Integer> hostIds,
            int hostCount,
            int partitionCount,
            int sitesPerHost) throws JSONException {
        // add all the sites
        int partitionCounter = -1;

        HashMap<Integer, ArrayList<Integer>> partToHosts =
            new HashMap<Integer, ArrayList<Integer>>();
        for (int i = 0; i < partitionCount; i++)
        {
            ArrayList<Integer> hosts = new ArrayList<Integer>();
            partToHosts.put(i, hosts);
        }
        for (int i = 0; i < sitesPerHost * hostCount; i++) {

            // serially assign partitions to execution sites.
            int partition = (++partitionCounter) % partitionCount;
            int hostForSite = hostIds.get(i / sitesPerHost);
            partToHosts.get(partition).add(hostForSite);
        }

        // We need to sort the hostID lists for each partition so that
        // the leader assignment magic in the loop below will work.
        for (Map.Entry<Integer, ArrayList<Integer>> e : partToHosts.entrySet()) {
            Collections.sort(e.getValue());
        }

        JSONStringer stringer = new JSONStringer();
        stringer.object();
        stringer.key("hostcount").value(m_hostCount);
        stringer.key("kfactor").value(getReplicationFactor());
        stringer.key("sites_per_host").value(sitesPerHost);
        stringer.key("partitions").array();
        for (int part = 0; part < partitionCount; part++)
        {
            stringer.object();
            stringer.key("partition_id").value(part);
            // This two-line magic deterministically spreads the partition leaders
            // evenly across the cluster at startup.
            int index = part % (getReplicationFactor() + 1);
            int master = partToHosts.get(part).get(index);
            stringer.key("master").value(master);
            stringer.key("replicas").array();
            for (int host_pos : partToHosts.get(part)) {
                stringer.value(host_pos);
            }
            stringer.endArray();
            stringer.endObject();
        }
        stringer.endArray();
        stringer.endObject();
        JSONObject topo = new JSONObject(stringer.toString());
        return topo;
    }
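
    /*
     * Worked example (hypothetical): hostIds = [0, 1], sitesPerHost = 2, k = 1,
     * so partitionCount = (2 * 2) / (1 + 1) = 2. Sites are assigned serially:
     *
     *   site 0 -> partition 0 on host 0      site 2 -> partition 0 on host 1
     *   site 1 -> partition 1 on host 0      site 3 -> partition 1 on host 1
     *
     * Partition 0 gets replicas [0, 1] with master host 0 (index 0 % 2) and
     * partition 1 gets replicas [0, 1] with master host 1 (index 1 % 2), so
     * leadership is spread evenly at startup.
     */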

    /*
     * Placement strategy that attempts to involve multiple nodes in replication
     * so that the socket between nodes is not a bottleneck.
     */
    JSONObject newPlacementStrategy(
            List<Integer> hostIds,
            int hostCount,
            int partitionCount,
            int sitesPerHost) throws JSONException {
        Collections.sort(hostIds);
        List<Partition> partitions = new ArrayList<Partition>();
        for (int ii = 0; ii < partitionCount; ii++) {
            partitions.add(new Partition(ii, getReplicationFactor() + 1));
        }

        List<Node> nodes = new ArrayList<Node>();
        for (Integer hostId : hostIds) {
            nodes.add(new Node(hostId));
        }

        /*
         * Distribute mastership round-robin across the nodes
         */
        for(int ii = 0; ii < partitions.size(); ii++) {
            Partition p = partitions.get(ii);
            Node n = nodes.get(ii % hostCount);
            p.m_master = n;
            p.decrementNeededReplicas();
            n.m_masterPartitions.add(p);
        }

        while (needReplication(partitions)) {
            Node n = nextNotFullNode(nodes, sitesPerHost);

            // Find a partition that will increase the number of host pairs inter-replicating
            boolean foundUsefulPartition = false;
            Partition partitionToUse = null;
            for (Partition p : partitions) {
                if (p.canUseAsReplica(n)) {
                    if (!p.m_master.m_replicationConnections.containsKey(n)) {
                        foundUsefulPartition = true;
                        partitionToUse = p;
                    }
                }
            }

            if (!foundUsefulPartition) {
                //Fall back to finding any old thing to replicate
                for (Partition p : partitions) {
                    if (p.canUseAsReplica(n)) {
                        partitionToUse = p;
                        break;
                    }
                }
                // No "useful" partition exists, so the chosen partition's master
                // already has a replication connection to this node and get(n)
                // returns the existing set.
                Set<Partition> replicatedPartitions = partitionToUse.m_master.m_replicationConnections.get(n);
                replicatedPartitions.add(partitionToUse);
                partitionToUse.m_replicas.add(n);
                n.m_replicaPartitions.add(partitionToUse);
                partitionToUse.decrementNeededReplicas();
            } else {
                //Connect the partition, node, and master together
                Set<Partition> replicatedPartitions = new HashSet<Partition>();
                replicatedPartitions.add(partitionToUse);
                partitionToUse.m_master.m_replicationConnections.put(n, replicatedPartitions);
                n.m_replicationConnections.put(partitionToUse.m_master, replicatedPartitions);
                n.m_replicaPartitions.add(partitionToUse);
                partitionToUse.decrementNeededReplicas();
                partitionToUse.m_replicas.add(n);
            }
        }

        JSONStringer stringer = new JSONStringer();
        stringer.object();
        stringer.key("hostcount").value(m_hostCount);
        stringer.key("kfactor").value(getReplicationFactor());
        stringer.key("sites_per_host").value(sitesPerHost);
        stringer.key("partitions").array();
        for (int part = 0; part < partitionCount; part++)
        {
            stringer.object();
            stringer.key("partition_id").value(part);
            stringer.key("master").value(partitions.get(part).m_master.m_hostId);
            stringer.key("replicas").array();
            for (Node n : partitions.get(part).m_replicas) {
                stringer.value(n.m_hostId);
            }
            // the master is also listed in the replicas array as one of the copies
            stringer.value(partitions.get(part).m_master.m_hostId);
            stringer.endArray();
            stringer.endObject();
        }
        stringer.endArray();
        stringer.endObject();

        JSONObject topo = new JSONObject(stringer.toString());
        return topo;
    }
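
    /*
     * Worked example (hypothetical): hostIds = [0, 1, 2], sitesPerHost = 2,
     * k = 1, so partitionCount = 3. Mastership is distributed round-robin
     * (partition 0 -> host 0, 1 -> host 1, 2 -> host 2), then each replica is
     * placed to create a new master-replica host pair where possible, so
     * replication traffic is spread across distinct sockets instead of
     * concentrating on a single pair of nodes.
     */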

    // Statically build a topology. This only runs at startup;
    // rejoin clones this from an existing server.
    public JSONObject getTopology(List<Integer> hostIds) throws JSONException
    {
        int hostCount = getHostCount();
        int partitionCount = getPartitionCount();
        int sitesPerHost = getSitesPerHost();

        if (hostCount != hostIds.size()) {
            throw new RuntimeException("Provided " + hostIds.size() + " host ids when host count is " + hostCount);
        }

        boolean useFallbackStrategy = Boolean.valueOf(System.getenv("VOLT_REPLICA_FALLBACK"));
        if ((sitesPerHost * hostCount) % (getReplicationFactor() + 1) > 0) {
            VoltDB.crashGlobalVoltDB("The cluster has more hosts and sites per hosts than required for the " +
                    "requested k-safety value.  The number of total sites (sitesPerHost * hostCount) must be a " +
                    "whole multiple of the number of copies of the database (k-safety + 1)", false, null);
        }
        if (sitesPerHost * hostCount % partitionCount > 0 || partitionCount < hostCount) {
            hostLog.warn("Unable to use optimal replica placement strategy with this configuration. " +
                    " Falling back to a less optimal strategy that may result in worse performance. " +
                    " Try using an even number of sites per host.");
            useFallbackStrategy = true;
        }

        JSONObject topo = null;
        if (useFallbackStrategy) {
            topo = fallbackPlacementStrategy(hostIds, hostCount, partitionCount, sitesPerHost);
        } else {
            try {
                topo = newPlacementStrategy(hostIds, hostCount, partitionCount, sitesPerHost);
            } catch (Exception e) {
                hostLog.error("Unable to use optimal replica placement strategy. " +
                              "Falling back to a less optimal strategy that may result in worse performance");
                topo = fallbackPlacementStrategy(hostIds, hostCount, partitionCount, sitesPerHost);
            }
        }

        hostLog.debug("TOPO: " + topo.toString(2));
        return topo;
    }
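
    /*
     * Usage sketch (hypothetical, assuming java.util.Arrays is imported):
     *
     *   ClusterConfig config = new ClusterConfig(3, 2, 1);
     *   if (!config.validate()) {
     *       throw new IllegalStateException(config.getErrorMsg());
     *   }
     *   JSONObject topo = config.getTopology(Arrays.asList(0, 1, 2));
     *
     * Setting the environment variable VOLT_REPLICA_FALLBACK to "true" forces
     * the fallback placement strategy.
     */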

    private final int m_hostCount;
    private final int m_sitesPerHost;
    private final int m_replicationFactor;

    private String m_errorMsg;
}