Package voldemort.client.rebalance

Source Code of voldemort.client.rebalance.RebalancePlan

/*
* Copyright 2008-2013 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package voldemort.client.rebalance;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.log4j.Logger;

import voldemort.cluster.Cluster;
import voldemort.cluster.Node;
import voldemort.store.StoreDefinition;
import voldemort.tools.PartitionBalance;
import voldemort.utils.MoveMap;
import voldemort.utils.PartitionBalanceUtils;
import voldemort.utils.RebalanceUtils;
import voldemort.utils.UpdateClusterUtils;
import voldemort.utils.Utils;

import com.google.common.collect.Lists;
import com.google.common.collect.TreeMultimap;

/**
* RebalancePlan encapsulates all aspects of planning a shuffle, cluster
* expansion, or zone expansion.
*/
public class RebalancePlan {

    private static final Logger logger = Logger.getLogger(RebalancePlan.class);

    /**
     * The number of "primary" partition IDs to move in each batch of the plan.
     * Moving a primary partition ID between nodes results in between zero and
     * (# of zones) * (2) * (# stores) partition-stores being moved. The (2)
     * comes from an upper bound of a single move affecting two-nodes per zone.
     */
    public final static int BATCH_SIZE = Integer.MAX_VALUE;

    private final Cluster currentCluster;
    private final List<StoreDefinition> currentStoreDefs;
    private final Cluster finalCluster;
    private final List<StoreDefinition> finalStoreDefs;
    private final int batchSize;
    private final String outputDir;

    private List<RebalanceBatchPlan> batchPlans;

    // Aggregate stats
    private int numPrimaryPartitionMoves;
    private int numPartitionStoreMoves;
    private int numXZonePartitionStoreMoves;
    private final MoveMap nodeMoveMap;
    private final MoveMap zoneMoveMap;

    /**
     * Constructs a plan for the specified change from currentCluster/StoreDefs
     * to finalCluster/StoreDefs.
     *
     * finalStoreDefs are needed for the zone expansion use case since store
     * definitions depend on the number of zones.
     *
     * In theory, since current & final StoreDefs are passed in, this plan could
     * be used to transform deployed store definitions. In practice, this use
     * case has not been tested.
     *
     * @param currentCluster current deployed cluster.
     * @param currentStoreDefs current deployed store defs
     * @param finalCluster desired deployed cluster
     * @param finalStoreDefs desired deployed store defs
     * @param batchSize number of primary partitions to move in each batch.
     * @param outputDir directory in which to dump metadata files for the plan
     */
    public RebalancePlan(final Cluster currentCluster,
                         final List<StoreDefinition> currentStoreDefs,
                         final Cluster finalCluster,
                         final List<StoreDefinition> finalStoreDefs,
                         int batchSize,
                         String outputDir) {
        this.currentCluster = currentCluster;
        this.currentStoreDefs = RebalanceUtils.validateRebalanceStore(currentStoreDefs);
        this.finalCluster = finalCluster;
        this.finalStoreDefs = RebalanceUtils.validateRebalanceStore(finalStoreDefs);
        this.batchSize = batchSize;
        this.outputDir = outputDir;

        // Derive the interimCluster from current & final cluster xml
        RebalanceUtils.validateCurrentFinalCluster(this.currentCluster, this.finalCluster);
        Cluster interimCluster = RebalanceUtils.getInterimCluster(this.currentCluster,
                                                                  this.finalCluster);

        // Verify each cluster/storedefs pair
        RebalanceUtils.validateClusterStores(this.currentCluster, this.currentStoreDefs);
        RebalanceUtils.validateClusterStores(this.finalCluster, this.finalStoreDefs);
        RebalanceUtils.validateClusterStores(interimCluster, this.finalStoreDefs);

        // Log key arguments
        logger.info("Current cluster : " + currentCluster);
        logger.info("Interim cluster : " + interimCluster);
        logger.info("Final cluster : " + finalCluster);
        logger.info("Batch size : " + batchSize);

        // Initialize the plan
        batchPlans = new ArrayList<RebalanceBatchPlan>();

        // Initialize aggregate statistics
        numPrimaryPartitionMoves = 0;
        numPartitionStoreMoves = 0;
        numXZonePartitionStoreMoves = 0;
        nodeMoveMap = new MoveMap(interimCluster.getNodeIds());
        zoneMoveMap = new MoveMap(interimCluster.getZoneIds());

        plan();
        // write plan to file
        RebalanceUtils.dumpPlanToFile(this.outputDir, this);
    }

    public RebalancePlan(final Cluster currentCluster,
                         final List<StoreDefinition> currentStores,
                         final Cluster finalCluster,
                         int batchSize,
                         String outputDir) {
        this(currentCluster, currentStores, finalCluster, currentStores, batchSize, outputDir);
    }

    /**
     * Create a plan. The plan consists of batches. Each batch involves the
     * movement of no more than batchSize primary partitions. The movement of a
     * single primary partition may require migration of other n-ary replicas,
     * and potentially deletions. Migrating a primary or n-ary partition
     * requires migrating one partition-store for every store hosted at that
     * partition.
     *
     */
    private void plan() {
        // Mapping of stealer node to list of primary partitions being moved
        final TreeMultimap<Integer, Integer> stealerToStolenPrimaryPartitions = TreeMultimap.create();

        // Output initial and final cluster
        if(outputDir != null)
            RebalanceUtils.dumpClusters(currentCluster, finalCluster, outputDir);

        // Determine which partitions must be stolen
        for(Node stealerNode: finalCluster.getNodes()) {
            List<Integer> stolenPrimaryPartitions = RebalanceUtils.getStolenPrimaryPartitions(currentCluster,
                                                                                              finalCluster,
                                                                                              stealerNode.getId());
            if(stolenPrimaryPartitions.size() > 0) {
                numPrimaryPartitionMoves += stolenPrimaryPartitions.size();
                stealerToStolenPrimaryPartitions.putAll(stealerNode.getId(),
                                                        stolenPrimaryPartitions);
            }
        }

        // Determine plan batch-by-batch
        int batches = 0;
        Cluster batchCurrentCluster = Cluster.cloneCluster(currentCluster);
        List<StoreDefinition> batchCurrentStoreDefs = this.currentStoreDefs;
        List<StoreDefinition> batchFinalStoreDefs = this.finalStoreDefs;
        Cluster batchFinalCluster = RebalanceUtils.getInterimCluster(this.currentCluster,
                                                                     this.finalCluster);

        while(!stealerToStolenPrimaryPartitions.isEmpty()) {

            int partitions = 0;
            List<Entry<Integer, Integer>> partitionsMoved = Lists.newArrayList();
            for(Entry<Integer, Integer> stealerToPartition: stealerToStolenPrimaryPartitions.entries()) {
                partitionsMoved.add(stealerToPartition);
                batchFinalCluster = UpdateClusterUtils.createUpdatedCluster(batchFinalCluster,
                                                                            stealerToPartition.getKey(),
                                                                            Lists.newArrayList(stealerToPartition.getValue()));
                partitions++;
                if(partitions == batchSize)
                    break;
            }

            // Remove the partitions moved
            for(Iterator<Entry<Integer, Integer>> partitionMoved = partitionsMoved.iterator(); partitionMoved.hasNext();) {
                Entry<Integer, Integer> entry = partitionMoved.next();
                stealerToStolenPrimaryPartitions.remove(entry.getKey(), entry.getValue());
            }

            if(outputDir != null)
                RebalanceUtils.dumpClusters(batchCurrentCluster,
                                            batchFinalCluster,
                                            outputDir,
                                            "batch-" + Integer.toString(batches) + ".");

            // Generate a plan to compute the tasks
            final RebalanceBatchPlan RebalanceBatchPlan = new RebalanceBatchPlan(batchCurrentCluster,
                                                                                 batchCurrentStoreDefs,
                                                                                 batchFinalCluster,
                                                                                 batchFinalStoreDefs);
            batchPlans.add(RebalanceBatchPlan);

            numXZonePartitionStoreMoves += RebalanceBatchPlan.getCrossZonePartitionStoreMoves();
            numPartitionStoreMoves += RebalanceBatchPlan.getPartitionStoreMoves();
            nodeMoveMap.add(RebalanceBatchPlan.getNodeMoveMap());
            zoneMoveMap.add(RebalanceBatchPlan.getZoneMoveMap());

            batches++;
            batchCurrentCluster = Cluster.cloneCluster(batchFinalCluster);
            // batchCurrentStoreDefs can only be different from
            // batchFinalStoreDefs for the initial batch.
            batchCurrentStoreDefs = batchFinalStoreDefs;
        }

        logger.info(this);
    }

    /**
     * Determines storage overhead and returns pretty printed summary.
     *
     * @param finalNodeToOverhead Map of node IDs from final cluster to number
     *        of partition-stores to be moved to the node.
     * @return pretty printed string summary of storage overhead.
     */
    private String storageOverhead(Map<Integer, Integer> finalNodeToOverhead) {
        double maxOverhead = Double.MIN_VALUE;
        PartitionBalance pb = new PartitionBalance(currentCluster, currentStoreDefs);
        StringBuilder sb = new StringBuilder();
        sb.append("Per-node store-overhead:").append(Utils.NEWLINE);
        DecimalFormat doubleDf = new DecimalFormat("####.##");
        for(int nodeId: finalCluster.getNodeIds()) {
            Node node = finalCluster.getNodeById(nodeId);
            String nodeTag = "Node " + String.format("%4d", nodeId) + " (" + node.getHost() + ")";
            int initialLoad = 0;
            if(currentCluster.getNodeIds().contains(nodeId)) {
                initialLoad = pb.getNaryPartitionCount(nodeId);
            }
            int toLoad = 0;
            if(finalNodeToOverhead.containsKey(nodeId)) {
                toLoad = finalNodeToOverhead.get(nodeId);
            }
            double overhead = (initialLoad + toLoad) / (double) initialLoad;
            if(initialLoad > 0 && maxOverhead < overhead) {
                maxOverhead = overhead;
            }

            String loadTag = String.format("%6d", initialLoad) + " + "
                             + String.format("%6d", toLoad) + " -> "
                             + String.format("%6d", initialLoad + toLoad) + " ("
                             + doubleDf.format(overhead) + " X)";
            sb.append(nodeTag + " : " + loadTag).append(Utils.NEWLINE);
        }
        sb.append(Utils.NEWLINE)
          .append("****  Max per-node storage overhead: " + doubleDf.format(maxOverhead) + " X.")
          .append(Utils.NEWLINE);
        return (sb.toString());
    }

    public Cluster getCurrentCluster() {
        return currentCluster;
    }

    public List<StoreDefinition> getCurrentStores() {
        return currentStoreDefs;
    }

    public Cluster getFinalCluster() {
        return finalCluster;
    }

    public List<StoreDefinition> getFinalStores() {
        return finalStoreDefs;
    }

    /**
     *
     * @return The plan!
     */
    public List<RebalanceBatchPlan> getPlan() {
        return batchPlans;
    }

    /**
     * Total number of rebalancing tasks in the plan.
     *
     * @return number of rebalancing tasks in the plan.
     */
    public int taskCount() {
        int numTasks = 0;
        for(RebalanceBatchPlan batchPlan: batchPlans) {
            numTasks += batchPlan.getTaskCount();
        }
        return numTasks;
    }

    public int getPrimariesMoved() {
        return numPrimaryPartitionMoves;
    }

    public int getPartitionStoresMoved() {
        return numPartitionStoreMoves;
    }

    public int getPartitionStoresMovedXZone() {
        return numXZonePartitionStoreMoves;
    }

    public MoveMap getNodeMoveMap() {
        return nodeMoveMap;
    }

    public MoveMap getZoneMoveMap() {
        return zoneMoveMap;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        // Add entire plan batch-by-batch, partition info-by-partition info...
        for(RebalanceBatchPlan batchPlan: batchPlans) {
            sb.append(batchPlan).append(Utils.NEWLINE);
        }
        // Add invalid metadata rate analysis
        sb.append(PartitionBalanceUtils.analyzeInvalidMetadataRate(currentCluster,
                                                                   currentStoreDefs,
                                                                   finalCluster,
                                                                   finalStoreDefs));
        // Summarize aggregate plan stats
        sb.append("Total number of primary partition moves : " + numPrimaryPartitionMoves)
          .append(Utils.NEWLINE)
          .append("Total number of rebalance tasks : " + taskCount())
          .append(Utils.NEWLINE)
          .append("Total number of partition-store moves : " + numPartitionStoreMoves)
          .append(Utils.NEWLINE)
          .append("Total number of cross-zone partition-store moves :"
                  + numXZonePartitionStoreMoves)
          .append(Utils.NEWLINE)
          .append("Zone move map (partition-stores):")
          .append(Utils.NEWLINE)
          .append("(zone id) -> (zone id) = # of partition-stores moving from the former zone to the latter")
          .append(Utils.NEWLINE)
          .append(zoneMoveMap)
          .append(Utils.NEWLINE)
          .append("Node flow map (partition-stores):")
          .append(Utils.NEWLINE)
          .append("# partitions-stores stealing into -> (node id) -> # of partition-stores donating out of")
          .append(Utils.NEWLINE)
          .append(nodeMoveMap.toFlowString())
          .append(Utils.NEWLINE)
          .append(storageOverhead(nodeMoveMap.groupByTo()))
          .append(Utils.NEWLINE);

        return sb.toString();
    }
}
TOP

Related Classes of voldemort.client.rebalance.RebalancePlan

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.