Package org.voltdb.sysprocs.saverestore

Source Code of org.voltdb.sysprocs.saverestore.PartitionedTableSaveFileState

/* This file is part of VoltDB.
* Copyright (C) 2008-2014 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
*/

package org.voltdb.sysprocs.saverestore;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;

import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.Pair;
import org.voltdb.ParameterSet;
import org.voltdb.VoltDB;
import org.voltdb.VoltSystemProcedure.SynthesizedPlanFragment;
import org.voltdb.VoltTableRow;
import org.voltdb.catalog.Table;
import org.voltdb.dtxn.SiteTracker;
import org.voltdb.sysprocs.SysProcFragmentId;



public class PartitionedTableSaveFileState extends TableSaveFileState
{
    private static final VoltLogger LOG = new VoltLogger(PartitionedTableSaveFileState.class.getName());
    private static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT");

    public PartitionedTableSaveFileState(String tableName, long txnId)
    {
        super(tableName, txnId);
    }

    @Override
    void addHostData(VoltTableRow row) throws IOException
    {
        assert(row.getString("TABLE").equals(getTableName()));

        if (m_totalPartitions == 0)
        {
            // XXX this cast should be okay unless we exceed MAX_INT partitions
            m_totalPartitions = (int) row.getLong("TOTAL_PARTITIONS");
        }
        checkSiteConsistency(row); // throws if inconsistent

        int originalPartitionId = (int) row.getLong("PARTITION");
        m_partitionsSeen.add(originalPartitionId);
        int currentHostId = (int) row.getLong("CURRENT_HOST_ID");
        Set<Pair<Integer, Integer>> partitions_at_host = null;
        if (!(m_partitionsAtHost.containsKey(currentHostId))) {
            partitions_at_host = new HashSet<Pair<Integer, Integer>>();
            m_partitionsAtHost.put( currentHostId, partitions_at_host);
        }
        partitions_at_host = m_partitionsAtHost.get(currentHostId);

        partitions_at_host.add(
                Pair.of(
                        originalPartitionId,
                        (int) row.getLong("ORIGINAL_HOST_ID")));
    }

    @Override
    public boolean isConsistent()
    {
        boolean consistent =
            ((m_partitionsSeen.size() == m_totalPartitions) &&
             (m_partitionsSeen.first() == 0) &&
             (m_partitionsSeen.last() == m_totalPartitions - 1));
        if (!consistent)
        {
            m_consistencyResult = "Table: " + getTableName() +
                " is missing " + (m_totalPartitions - m_partitionsSeen.size()) +
                " out of " + m_totalPartitions + " total partitions" +
                " (partitions seen: " + m_partitionsSeen + ")";

        }
        else
        {
            m_consistencyResult = "Table: " + getTableName() +
                " has consistent savefile state.";
        }
        return consistent;
    }

    int getTotalPartitions()
    {
        return m_totalPartitions;
    }

    @Override
    public SynthesizedPlanFragment[]
    generateRestorePlan(Table catalogTable, SiteTracker st)
    {
        SynthesizedPlanFragment[] restore_plan = null;
        LOG.info("Total partitions for Table: " + getTableName() + ": " +
                 getTotalPartitions());
        if (!catalogTable.getIsreplicated())
        {
            restore_plan = generatePartitionedToPartitionedPlan(st);
        }
        else
        {
            restore_plan = generatePartitionedToReplicatedPlan(st);
        }
        return restore_plan;
    }

    private void checkSiteConsistency(VoltTableRow row) throws IOException
    {
        if (!row.getString("IS_REPLICATED").equals("FALSE"))
        {
            String error = "Table: " + getTableName() + " was partitioned " +
            "but has a savefile which indicates replication at site: " +
            row.getLong("CURRENT_HOST_ID");
            m_consistencyResult = error;
            throw new IOException(error);
        }

        if ((int) row.getLong("TOTAL_PARTITIONS") != getTotalPartitions())
        {
            String error = "Table: " + getTableName() + " has a savefile " +
            " with an inconsistent number of total partitions: " +
            row.getLong("TOTAL_PARTITIONS") + " (previous values were " +
            getTotalPartitions() + ") at site: " +
            row.getLong("CURRENT_HOST_ID");
            m_consistencyResult = error;
            throw new IOException(error);
        }
    }

    private SynthesizedPlanFragment[] generatePartitionedToReplicatedPlan(SiteTracker st) {
        ArrayList<SynthesizedPlanFragment> restorePlan = new ArrayList<SynthesizedPlanFragment>();
        Set<Integer> coveredPartitions = new HashSet<Integer>();

        Iterator<Entry<Integer, Set<Pair<Integer, Integer>>>> partitionAtHostItr =
                m_partitionsAtHost.entrySet().iterator();

        // looping through all current hosts having .vpt files of this table
        while(partitionAtHostItr.hasNext()) {
            Entry<Integer, Set<Pair<Integer, Integer>>> partitionAtHost = partitionAtHostItr.next();
            Integer host = partitionAtHost.getKey();
            List<Integer> loadPartitions = new ArrayList<Integer>();
            List<Integer> loadOrigHosts = new ArrayList<Integer>();
            Set<Pair<Integer, Integer>> partitionAndOrigHostSet = partitionAtHost.getValue();
            Iterator<Pair<Integer, Integer>> itr = partitionAndOrigHostSet.iterator();

            // calculate which available partitions not yet been covered and put
            // its partition_id and orig_host_id in loadPartitions and loadOrigHosts
            while(itr.hasNext()) {
                Pair<Integer, Integer> pair = itr.next();
                if(!coveredPartitions.contains(pair.getFirst())) {
                    loadPartitions.add(pair.getFirst());
                    loadOrigHosts.add(pair.getSecond());
                    coveredPartitions.add(pair.getFirst());
                }
            }

            // if there are some work to do
            if(loadPartitions.size() > 0){
                int[] relevantPartitionIds = com.google_voltpatches.common.primitives.Ints.toArray(loadPartitions);
                int[] originalHosts = com.google_voltpatches.common.primitives.Ints.toArray(loadOrigHosts);
                List<Long> sitesAtHost = st.getSitesForHost(host);

                // for each site of this host, generate one work fragment and let them execute in parallel
                for(Long site : sitesAtHost) {
                    restorePlan.add(constructDistributePartitionedTableFragment(
                            site, relevantPartitionIds, originalHosts, true));
                }
            }
        }
        restorePlan.add(constructDistributePartitionedTableAggregatorFragment(true));
        assert(coveredPartitions.size() == m_partitionsSeen.size());
        return restorePlan.toArray(new SynthesizedPlanFragment[0]);
    }

    private SynthesizedPlanFragment[] generatePartitionedToPartitionedPlan(SiteTracker st) {
        LOG.info("Partition set: " + m_partitionsSeen);
        ArrayList<SynthesizedPlanFragment> restorePlan = new ArrayList<SynthesizedPlanFragment>();
        HashSet<Integer> coveredPartitions = new HashSet<Integer>();

        HashMap<Integer, ArrayList<Integer>> hostsToUncoveredPartitions = new HashMap<Integer, ArrayList<Integer>>();
        HashMap<Integer, ArrayList<Integer>> hostsToOriginalHosts = new HashMap<Integer, ArrayList<Integer>>();

        for (Integer host : m_partitionsAtHost.keySet()) {
            hostsToUncoveredPartitions.put(host, new ArrayList<Integer>());
            hostsToOriginalHosts.put(host, new ArrayList<Integer>());
        }

        /*
         * Loop through the list of hosts repeatedly. Each time pick only one
         * partition to distribute from each host. This ensures some load
         * balancing.
         */
        while (!coveredPartitions.containsAll(m_partitionsSeen)) {
            Iterator<Integer> hosts = m_partitionsAtHost.keySet().iterator();
            // Track if progress was made, if nothing to distribute
            // was found and we aren't covering then it is missing partitions
            int numPartitionsUsed = 0;
            while (hosts.hasNext()) {
                /**
                 * Get the list of partitions on this host and remove all that
                 * were covered already
                 */
                Integer nextHost = hosts.next();
                Set<Pair<Integer, Integer>> partitionsAndOrigHosts = new HashSet<Pair<Integer, Integer>>(
                        m_partitionsAtHost.get(nextHost));
                Iterator<Pair<Integer, Integer>> removeCoveredIterator = partitionsAndOrigHosts
                        .iterator();

                List<Integer> uncoveredPartitionsAtHostList = hostsToUncoveredPartitions
                        .get(nextHost);
                ArrayList<Integer> originalHosts = hostsToOriginalHosts
                        .get(nextHost);
                while (removeCoveredIterator.hasNext()) {
                    Pair<Integer, Integer> p = removeCoveredIterator.next();
                    if (coveredPartitions.contains(p.getFirst())) {
                        removeCoveredIterator.remove();
                    }
                }

                /*
                 * If there is a partition left that isn't covered select it for
                 * distribution
                 */
                Iterator<Pair<Integer, Integer>> candidatePartitions = partitionsAndOrigHosts
                        .iterator();
                if (candidatePartitions.hasNext()) {
                    Pair<Integer, Integer> p = candidatePartitions.next();
                    coveredPartitions.add(p.getFirst());
                    uncoveredPartitionsAtHostList.add(p.getFirst());
                    originalHosts.add(p.getSecond());
                    numPartitionsUsed++;
                }
            }
            if (numPartitionsUsed == 0
                    && !coveredPartitions.containsAll(m_partitionsSeen)) {
                LOG.error("Could not find a host to distribute some partitions");
                return null;
            }
        }

        SNAP_LOG.info("Distribution plan for table " + getTableName());
        for (Integer host : m_partitionsAtHost.keySet()) {
            List<Integer> uncoveredPartitionsAtHostList = hostsToUncoveredPartitions
                    .get(host);
            ArrayList<Integer> originalHosts = hostsToOriginalHosts.get(host);

            List<Long> sitesAtHost = VoltDB.instance().getSiteTrackerForSnapshot()
                    .getSitesForHost(host);

            int originalHostsArray[] = new int[originalHosts.size()];
            int qq = 0;
            for (int originalHostId : originalHosts)
                originalHostsArray[qq++] = originalHostId;
            int uncoveredPartitionsAtHost[] = new int[uncoveredPartitionsAtHostList
                    .size()];
            for (int ii = 0; ii < uncoveredPartitionsAtHostList.size(); ii++) {
                uncoveredPartitionsAtHost[ii] = uncoveredPartitionsAtHostList
                        .get(ii);
            }

            StringBuilder sb = new StringBuilder();
            sb.append("\tHost ").append(host)
                    .append(" will distribute partitions ");
            for (Integer partition : uncoveredPartitionsAtHostList) {
                sb.append(partition).append(' ');
            }
            SNAP_LOG.info(sb.toString());

            /*
             * Assigning the FULL workload to each site. At the actual host
             * static synchronization in the procedure will ensure the work is
             * distributed across every ES in a meaningful way.
             */
            for (Long site : sitesAtHost) {
                restorePlan.add(constructDistributePartitionedTableFragment(
                        site, uncoveredPartitionsAtHost, originalHostsArray, false));
            }
        }
        restorePlan
                .add(constructDistributePartitionedTableAggregatorFragment(false));
        return restorePlan.toArray(new SynthesizedPlanFragment[0]);
    }

    private SynthesizedPlanFragment
    constructDistributePartitionedTableFragment(
            long distributorSiteId,     // site which will execute this plan fragment
            int uncoveredPartitionsAtHost[],    // which partitions' data in the .vpt files will be extracted as TableSaveFile
            int originalHostsArray[],           // used to locate .vpt files
            boolean asReplicated)
    {
        int result_dependency_id = getNextDependencyId();
        SynthesizedPlanFragment plan_fragment = new SynthesizedPlanFragment();
        plan_fragment.fragmentId =
                (asReplicated ? SysProcFragmentId.PF_restoreDistributePartitionedTableAsReplicated
                              : SysProcFragmentId.PF_restoreDistributePartitionedTableAsPartitioned);
        plan_fragment.multipartition = false;
        plan_fragment.siteId = distributorSiteId;
        plan_fragment.outputDepId = result_dependency_id;
        plan_fragment.inputDepIds = new int[] {};
        addPlanDependencyId(result_dependency_id);
        plan_fragment.parameters = ParameterSet.fromArrayNoCopy(
                getTableName(),
                originalHostsArray,
                uncoveredPartitionsAtHost,
                result_dependency_id);
        return plan_fragment;
    }

    private SynthesizedPlanFragment
    constructDistributePartitionedTableAggregatorFragment(boolean asReplicated)
    {
        int result_dependency_id = getNextDependencyId();
        SynthesizedPlanFragment plan_fragment = new SynthesizedPlanFragment();
        plan_fragment.fragmentId =
            SysProcFragmentId.PF_restoreReceiveResultTables;
        plan_fragment.multipartition = false;
        plan_fragment.outputDepId = result_dependency_id;
        plan_fragment.inputDepIds = getPlanDependencyIds();
        setRootDependencyId(result_dependency_id);
        plan_fragment.parameters = ParameterSet.fromArrayNoCopy(
                result_dependency_id,
                (asReplicated ?
                        "Aggregating partitioned-to-replicated table restore results"
                        : "Aggregating partitioned table restore results"));
        return plan_fragment;
    }

    // XXX-BLAH should this move to SiteTracker?
    public Set<Pair<Integer, Integer>> getPartitionsAtHost(int hostId) {
        return m_partitionsAtHost.get(hostId);
    }

    Set<Integer> getPartitionSet()
    {
        return m_partitionsSeen;
    }

    /**
     * Set of original PartitionId
     */
    private final TreeSet<Integer> m_partitionsSeen =
          new TreeSet<Integer>();

    /**
     * Map from a current host id to a pair of an original
     * partition id and the original host id
     */
    private final Map<Integer, Set<Pair<Integer, Integer>>> m_partitionsAtHost =
        new HashMap<Integer, Set<Pair<Integer, Integer>>>();
    private int m_totalPartitions = 0;
}
TOP

Related Classes of org.voltdb.sysprocs.saverestore.PartitionedTableSaveFileState

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.