/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery;
import com.google.common.base.Predicate;
import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlock;
import org.elasticsearch.cluster.block.ClusterBlockLevel;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.operation.hash.djb.DjbHashFunction;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.zen.ZenDiscovery;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.discovery.zen.fd.FaultDetection;
import org.elasticsearch.discovery.zen.membership.MembershipAction;
import org.elasticsearch.discovery.zen.ping.ZenPing;
import org.elasticsearch.discovery.zen.ping.ZenPingService;
import org.elasticsearch.discovery.zen.ping.unicast.UnicastZenPing;
import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.elasticsearch.test.InternalTestCluster;
import org.elasticsearch.test.discovery.ClusterDiscoveryConfiguration;
import org.elasticsearch.test.disruption.*;
import org.elasticsearch.test.junit.annotations.TestLogging;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.*;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.hamcrest.Matchers.*;
/**
 * Tests for discovery and cluster state consistency under service disruptions such as network
 * partitions, unresponsive nodes, long GC pauses and slow cluster state processing.
 */
@LuceneTestCase.Slow
@TestLogging("discovery.zen:TRACE")
@ClusterScope(scope = Scope.TEST, numDataNodes = 0, transportClientRatio = 0)
public class DiscoveryWithServiceDisruptions extends ElasticsearchIntegrationTest {
private static final TimeValue DISRUPTION_HEALING_OVERHEAD = TimeValue.timeValueSeconds(40); // we use 30s as timeout in many places.
private ClusterDiscoveryConfiguration discoveryConfig;
@Override
protected Settings nodeSettings(int nodeOrdinal) {
return discoveryConfig.node(nodeOrdinal);
}
@Before
public void clearConfig() {
discoveryConfig = null;
}
@Override
protected int numberOfShards() {
return 3;
}
@Override
protected int numberOfReplicas() {
return 1;
}
private List<String> startCluster(int numberOfNodes) throws ExecutionException, InterruptedException {
return startCluster(numberOfNodes, -1);
}
private List<String> startCluster(int numberOfNodes, int minimumMasterNode) throws ExecutionException, InterruptedException {
configureCluster(numberOfNodes, minimumMasterNode);
List<String> nodes = internalCluster().startNodesAsync(numberOfNodes).get();
ensureStableCluster(numberOfNodes);
// TODO: this is a temporary solution so that nodes will not base their reaction to a partition on previous successful ping results
for (ZenPingService pingService : internalCluster().getInstances(ZenPingService.class)) {
for (ZenPing zenPing : pingService.zenPings()) {
if (zenPing instanceof UnicastZenPing) {
((UnicastZenPing) zenPing).clearTemporalResponses();
}
}
}
return nodes;
}
private List<String> startUnicastCluster(int numberOfNodes, @Nullable int[] unicastHostsOrdinals, int minimumMasterNode) throws ExecutionException, InterruptedException {
configureUnicastCluster(numberOfNodes, unicastHostsOrdinals, minimumMasterNode);
List<String> nodes = internalCluster().startNodesAsync(numberOfNodes).get();
ensureStableCluster(numberOfNodes);
// TODO: this is a temporary solution so that nodes will not base their reaction to a partition on previous successful ping results
for (ZenPingService pingService : internalCluster().getInstances(ZenPingService.class)) {
for (ZenPing zenPing : pingService.zenPings()) {
if (zenPing instanceof UnicastZenPing) {
((UnicastZenPing) zenPing).clearTemporalResponses();
}
}
}
return nodes;
}
final static Settings DEFAULT_SETTINGS = ImmutableSettings.builder()
.put(FaultDetection.SETTING_PING_TIMEOUT, "1s") // for hitting simulated network failures quickly
.put(FaultDetection.SETTING_PING_RETRIES, "1") // for hitting simulated network failures quickly
.put("discovery.zen.join_timeout", "10s") // still long to induce failures but to long so test won't time out
.put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
.put("http.enabled", false) // just to make test quicker
.put("gateway.local.list_timeout", "10s") // still long to induce failures but to long so test won't time out
.put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
.build();
private void configureCluster(int numberOfNodes, int minimumMasterNode) throws ExecutionException, InterruptedException {
if (randomBoolean()) {
configureMulticastCluster(numberOfNodes, minimumMasterNode);
} else {
configureUnicastCluster(numberOfNodes, null, minimumMasterNode);
}
}
private void configureMulticastCluster(int numberOfNodes, int minimumMasterNode) throws ExecutionException, InterruptedException {
if (minimumMasterNode < 0) {
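// default to a quorum (a majority of the nodes) so a partitioned minority can never elect its own master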
minimumMasterNode = numberOfNodes / 2 + 1;
}
// TODO: Rarely use default settings for some of these
Settings settings = ImmutableSettings.builder()
.put(DEFAULT_SETTINGS)
.put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, minimumMasterNode)
.build();
if (discoveryConfig == null) {
discoveryConfig = new ClusterDiscoveryConfiguration(numberOfNodes, settings);
}
}
private void configureUnicastCluster(int numberOfNodes, @Nullable int[] unicastHostsOrdinals, int minimumMasterNode) throws ExecutionException, InterruptedException {
if (minimumMasterNode < 0) {
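// default to a quorum (a majority of the nodes) so a partitioned minority can never elect its own master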
minimumMasterNode = numberOfNodes / 2 + 1;
}
// TODO: Rarely use default settings for some of these
Settings nodeSettings = ImmutableSettings.builder()
.put(DEFAULT_SETTINGS)
.put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, minimumMasterNode)
.build();
if (discoveryConfig == null) {
if (unicastHostsOrdinals == null) {
discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, nodeSettings);
} else {
discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, nodeSettings, unicastHostsOrdinals);
}
}
}
/**
* Test that no split brain occurs under partial network partition. See https://github.com/elasticsearch/elasticsearch/issues/2488
*
* @throws Exception
*/
@Test
public void failWithMinimumMasterNodesConfigured() throws Exception {
List<String> nodes = startCluster(3);
// Figure out which node is the elected master
final String masterNode = internalCluster().getMasterName();
logger.info("---> legit elected master node=" + masterNode);
// Pick a node that isn't the elected master.
Set<String> nonMasters = new HashSet<>(nodes);
nonMasters.remove(masterNode);
final String unluckyNode = randomFrom(nonMasters.toArray(Strings.EMPTY_ARRAY));
// Simulate a network issue between the unlucky node and elected master node in both directions.
NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(masterNode, unluckyNode, getRandom());
setDisruptionScheme(networkDisconnect);
networkDisconnect.startDisrupting();
// Wait until the elected master has removed the unlucky node...
ensureStableCluster(2, masterNode);
// The unlucky node must report *no* master node, since it can't connect to the master and in fact
// should keep pinging until the network failure has been resolved. However, it may take a bit
// before the node detects it has been cut off from the elected master.
assertNoMaster(unluckyNode);
networkDisconnect.stopDisrupting();
// Wait until the master node sees all 3 nodes again.
ensureStableCluster(3);
// The elected master shouldn't have changed, since the unlucky node could never have elected itself
// as master: a minimum_master_nodes of 2 could never be satisfied on its own.
assertMaster(masterNode, nodes);
}
/** Verify that nodes fault detection works after master (re)election */
@Test
@TestLogging(value = "cluster.service:TRACE,indices.recovery:TRACE")
public void testNodesFDAfterMasterReelection() throws Exception {
startCluster(3);
logger.info("stopping current master");
internalCluster().stopCurrentMasterNode();
ensureStableCluster(2);
String master = internalCluster().getMasterName();
String nonMaster = null;
for (String node : internalCluster().getNodeNames()) {
if (!node.equals(master)) {
nonMaster = node;
}
}
logger.info("--> isolating [{}]", nonMaster);
addRandomIsolation(nonMaster).startDisrupting();
logger.info("--> waiting for master to remove it");
ensureStableCluster(1, master);
}
/**
 * Verify that the proper block is applied when nodes lose their master
*/
@Test
@TestLogging(value = "cluster.service:TRACE,indices.recovery:TRACE")
public void testVerifyApiBlocksDuringPartition() throws Exception {
startCluster(3);
// Makes sure that the get request can be executed on each node locally:
assertAcked(prepareCreate("test").setSettings(ImmutableSettings.builder()
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
));
// Everything is stable now, it is now time to simulate evil...
// but first make sure we have no initializing shards and all is green
// (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
ensureGreen("test");
NetworkPartition networkPartition = addRandomPartition();
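// the random partition splits the 3-node cluster into a one-node minority side and a two-node majority side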
final String isolatedNode = networkPartition.getMinoritySide().get(0);
final String nonIsolatedNode = networkPartition.getMajoritySide().get(0);
// Simulate a network issue between the unlucky node and the rest of the cluster.
networkPartition.startDisrupting();
// The unlucky node must report *no* master node, since it can't connect to the master and in fact
// should keep pinging until the network failure has been resolved. However, it may take a bit
// before the node detects it has been cut off from the elected master.
logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_WRITES, TimeValue.timeValueSeconds(10));
logger.info("wait until elected master has been removed and a new 2 node cluster was from (via [{}])", isolatedNode);
ensureStableCluster(2, nonIsolatedNode);
for (String node : networkPartition.getMajoritySide()) {
ClusterState nodeState = getNodeClusterState(node);
boolean success = true;
if (nodeState.nodes().getMasterNode() == null) {
success = false;
}
if (!nodeState.blocks().global().isEmpty()) {
success = false;
}
if (!success) {
fail("node [" + node + "] has no master or has blocks, despite of being on the right side of the partition. State dump:\n"
+ nodeState.prettyPrint());
}
}
networkPartition.stopDisrupting();
// Wait until the master node sees all 3 nodes again.
ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkPartition.expectedTimeToHeal().millis()));
logger.info("Verify no master block with {} set to {}", DiscoverySettings.NO_MASTER_BLOCK, "all");
client().admin().cluster().prepareUpdateSettings()
.setTransientSettings(ImmutableSettings.builder().put(DiscoverySettings.NO_MASTER_BLOCK, "all"))
.get();
networkPartition.startDisrupting();
// The unlucky node must report *no* master node, since it can't connect to the master and in fact
// should keep pinging until the network failure has been resolved. However, it may take a bit
// before the node detects it has been cut off from the elected master.
logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_ALL, TimeValue.timeValueSeconds(10));
// make sure we have a stable cluster and that cross-partition recoveries are cancelled by the removal
// of the missing node. With an unresponsive partition, recoveries only time out after 15m (the default),
// which would cause the test to fail due to unfreed resources.
ensureStableCluster(2, nonIsolatedNode);
}
/**
 * This test isolates the master from the rest of the cluster, waits for a new master to be elected, heals the partition
 * and verifies that all nodes agree on the new cluster state
*/
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
final List<String> nodes = startCluster(3);
assertAcked(prepareCreate("test")
.setSettings(ImmutableSettings.builder()
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))
));
ensureGreen();
String isolatedNode = internalCluster().getMasterName();
NetworkPartition networkPartition = addRandomIsolation(isolatedNode);
networkPartition.startDisrupting();
String nonIsolatedNode = networkPartition.getMajoritySide().get(0);
// make sure cluster reforms
ensureStableCluster(2, nonIsolatedNode);
// make sure the isolated node notices it has been cut off
assertNoMaster(isolatedNode, TimeValue.timeValueSeconds(40));
// heal the partition
networkPartition.stopDisrupting();
ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkPartition.expectedTimeToHeal().millis()));
logger.info("issue a reroute");
// trigger a reroute now, instead of waiting for the background reroute of RerouteService
assertAcked(client().admin().cluster().prepareReroute());
// and wait for it to finish and for the cluster to stabilize
ensureGreen("test");
// verify all cluster states are the same
ClusterState state = null;
for (String node : nodes) {
ClusterState nodeState = getNodeClusterState(node);
if (state == null) {
state = nodeState;
continue;
}
// assert cluster states are identical
try {
assertEquals("unequal versions", state.version(), nodeState.version());
assertEquals("unequal node count", state.nodes().size(), nodeState.nodes().size());
assertEquals("different masters ", state.nodes().masterNodeId(), nodeState.nodes().masterNodeId());
assertEquals("different meta data version", state.metaData().version(), nodeState.metaData().version());
if (!state.routingTable().prettyPrint().equals(nodeState.routingTable().prettyPrint())) {
fail("different routing");
}
} catch (AssertionError t) {
fail("failed comparing cluster state: " + t.getMessage() + "\n" +
"--- cluster state of node [" + nodes.get(0) + "]: ---\n" + state.prettyPrint() +
"\n--- cluster state [" + node + "]: ---\n" + nodeState.prettyPrint());
}
}
}
/**
 * Test that we do not lose documents whose indexing requests were successful, under a randomly selected disruption scheme.
 * We also collect & report the types of indexing failures that occur.
*
* This test is a superset of tests run in the Jepsen test suite, with the exception of versioned updates
*/
@Test
// NOTE: if you remove the AwaitsFix, make sure to port the test to the 1.x branch
@LuceneTestCase.AwaitsFix(bugUrl = "needs some more work to stabilize")
@TestLogging("action.index:TRACE,action.get:TRACE,discovery:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
public void testAckedIndexing() throws Exception {
// TODO: add node count randomization
final List<String> nodes = startCluster(3);
assertAcked(prepareCreate("test")
.setSettings(ImmutableSettings.builder()
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))
));
ensureGreen();
ServiceDisruptionScheme disruptionScheme = addRandomDisruptionScheme();
logger.info("disruption scheme [{}] added", disruptionScheme);
final ConcurrentHashMap<String, String> ackedDocs = new ConcurrentHashMap<>(); // id -> node sent.
final AtomicBoolean stop = new AtomicBoolean(false);
List<Thread> indexers = new ArrayList<>(nodes.size());
List<Semaphore> semaphores = new ArrayList<>(nodes.size());
final AtomicInteger idGenerator = new AtomicInteger(0);
final AtomicReference<CountDownLatch> countDownLatchRef = new AtomicReference<>();
final List<Exception> exceptedExceptions = Collections.synchronizedList(new ArrayList<Exception>());
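// Coordination between the test and the indexer threads: each semaphore permit allows the matching
// indexer to attempt one indexing request, and the shared latch is counted down once per attempt
// (whether it was acked or failed), so the test can wait for a whole batch to complete.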
logger.info("starting indexers");
try {
for (final String node : nodes) {
final Semaphore semaphore = new Semaphore(0);
semaphores.add(semaphore);
final Client client = client(node);
final String name = "indexer_" + indexers.size();
final int numPrimaries = getNumShards("test").numPrimaries;
Thread thread = new Thread(new Runnable() {
@Override
public void run() {
while (!stop.get()) {
String id = null;
try {
if (!semaphore.tryAcquire(10, TimeUnit.SECONDS)) {
continue;
}
logger.info("[{}] Acquired semaphore and it has {} permits left", name, semaphore.availablePermits());
try {
id = Integer.toString(idGenerator.incrementAndGet());
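// compute which primary shard the doc will be routed to (same DJB hash the cluster routing uses), purely for logging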
int shard = ((InternalTestCluster) cluster()).getInstance(DjbHashFunction.class).hash(id) % numPrimaries;
logger.trace("[{}] indexing id [{}] through node [{}] targeting shard [{}]", name, id, node, shard);
IndexResponse response = client.prepareIndex("test", "type", id).setSource("{}").setTimeout("1s").get();
assertThat(response.getVersion(), equalTo(1l));
ackedDocs.put(id, node);
logger.trace("[{}] indexed id [{}] through node [{}]", name, id, node);
} catch (ElasticsearchException e) {
exceptedExceptions.add(e);
logger.trace("[{}] failed id [{}] through node [{}]", e, name, id, node);
} finally {
countDownLatchRef.get().countDown();
logger.trace("[{}] decreased counter : {}", name, countDownLatchRef.get().getCount());
}
} catch (InterruptedException e) {
// fine - semaphore interrupt
} catch (Throwable t) {
logger.info("unexpected exception in background thread of [{}]", t, node);
}
}
}
});
thread.setName(name);
thread.setDaemon(true);
thread.start();
indexers.add(thread);
}
int docsPerIndexer = randomInt(3);
logger.info("indexing " + docsPerIndexer + " docs per indexer before partition");
countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
for (Semaphore semaphore : semaphores) {
semaphore.release(docsPerIndexer);
}
assertTrue(countDownLatchRef.get().await(1, TimeUnit.MINUTES));
for (int iter = 1 + randomInt(2); iter > 0; iter--) {
logger.info("starting disruptions & indexing (iteration [{}])", iter);
disruptionScheme.startDisrupting();
docsPerIndexer = 1 + randomInt(5);
logger.info("indexing " + docsPerIndexer + " docs per indexer during partition");
countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
Collections.shuffle(semaphores);
for (Semaphore semaphore : semaphores) {
assertThat(semaphore.availablePermits(), equalTo(0));
semaphore.release(docsPerIndexer);
}
assertTrue(countDownLatchRef.get().await(60000 + disruptionScheme.expectedTimeToHeal().millis() * (docsPerIndexer * indexers.size()), TimeUnit.MILLISECONDS));
logger.info("stopping disruption");
disruptionScheme.stopDisrupting();
ensureStableCluster(3, TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() + DISRUPTION_HEALING_OVERHEAD.millis()));
ensureGreen("test");
logger.info("validating successful docs");
for (String node : nodes) {
try {
logger.debug("validating through node [{}]", node);
for (String id : ackedDocs.keySet()) {
assertTrue("doc [" + id + "] indexed via node [" + ackedDocs.get(id) + "] not found",
client(node).prepareGet("test", "type", id).setPreference("_local").get().isExists());
}
} catch (AssertionError e) {
throw new AssertionError(e.getMessage() + " (checked via node [" + node + "])", e);
}
}
logger.info("done validating (iteration [{}])", iter);
}
} finally {
if (exceptedExceptions.size() > 0) {
StringBuilder sb = new StringBuilder("Indexing exceptions during disruption:");
for (Exception e : exceptedExceptions) {
sb.append("\n").append(e.getMessage());
}
logger.debug(sb.toString());
}
logger.info("shutting down indexers");
stop.set(true);
for (Thread indexer : indexers) {
indexer.interrupt();
indexer.join(60000);
}
}
}
/**
 * Test that the cluster recovers from a long GC on the master that causes other nodes to elect a new one
*/
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
public void testMasterNodeGCs() throws Exception {
// TODO: on Mac OS multicast threads are shared between nodes and therefore we can't simulate a GC and stop pinging for just one node.
// Find a way to block thread creation in the generic thread pool to avoid this.
List<String> nodes = startUnicastCluster(3, null, -1);
String oldMasterNode = internalCluster().getMasterName();
// a very long GC, but it's OK as we remove the disruption when it has had an effect
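// the numeric arguments control how often and for how long (in ms) the disrupted node's threads are
// suspended; exact semantics depend on LongGCDisruption's constructor (assumption, not verified here)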
SingleNodeDisruption masterNodeDisruption = new LongGCDisruption(oldMasterNode, getRandom(), 100, 200, 30000, 60000);
internalCluster().setDisruptionScheme(masterNodeDisruption);
masterNodeDisruption.startDisrupting();
Set<String> oldNonMasterNodesSet = new HashSet<>(nodes);
oldNonMasterNodesSet.remove(oldMasterNode);
List<String> oldNonMasterNodes = new ArrayList<>(oldNonMasterNodesSet);
logger.info("waiting for nodes to de-elect master [{}]", oldMasterNode);
for (String node : oldNonMasterNodesSet) {
assertDifferentMaster(node, oldMasterNode);
}
logger.info("waiting for nodes to elect a new master");
ensureStableCluster(2, oldNonMasterNodes.get(0));
logger.info("waiting for any pinging to stop");
for (final String node : oldNonMasterNodes) {
assertTrue("node [" + node + "] is still joining master", awaitBusy(new Predicate<Object>() {
@Override
public boolean apply(Object input) {
return !((ZenDiscovery) internalCluster().getInstance(Discovery.class, node)).joiningCluster();
}
}, 30, TimeUnit.SECONDS));
}
// restore GC
masterNodeDisruption.stopDisrupting();
ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + masterNodeDisruption.expectedTimeToHeal().millis()),
oldNonMasterNodes.get(0));
// make sure all nodes agree on master
String newMaster = internalCluster().getMasterName();
assertThat(newMaster, not(equalTo(oldMasterNode)));
assertMaster(newMaster, nodes);
}
/**
 * Test that a document which is indexed on the majority side of a partition, is available from the minority side,
* once the partition is healed
*
* @throws Exception
*/
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
List<String> nodes = startCluster(3);
assertAcked(prepareCreate("test")
.setSettings(ImmutableSettings.builder()
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
)
.get());
ensureGreen("test");
nodes = new ArrayList<>(nodes);
Collections.shuffle(nodes, getRandom());
String isolatedNode = nodes.get(0);
String notIsolatedNode = nodes.get(1);
ServiceDisruptionScheme scheme = addRandomIsolation(isolatedNode);
scheme.startDisrupting();
ensureStableCluster(2, notIsolatedNode);
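// with one node isolated, only 2 of the 3 shard copies can be allocated, so wait for yellow
// (all primaries active) rather than green before indexing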
assertFalse(client(notIsolatedNode).admin().cluster().prepareHealth("test").setWaitForYellowStatus().get().isTimedOut());
IndexResponse indexResponse = internalCluster().client(notIsolatedNode).prepareIndex("test", "type").setSource("field", "value").get();
assertThat(indexResponse.getVersion(), equalTo(1l));
logger.info("Verifying if document exists via node[" + notIsolatedNode + "]");
GetResponse getResponse = internalCluster().client(notIsolatedNode).prepareGet("test", "type", indexResponse.getId())
.setPreference("_local")
.get();
assertThat(getResponse.isExists(), is(true));
assertThat(getResponse.getVersion(), equalTo(1l));
assertThat(getResponse.getId(), equalTo(indexResponse.getId()));
scheme.stopDisrupting();
ensureStableCluster(3);
ensureGreen("test");
for (String node : nodes) {
logger.info("Verifying if document exists after isolating node[" + isolatedNode + "] via node[" + node + "]");
getResponse = internalCluster().client(node).prepareGet("test", "type", indexResponse.getId())
.setPreference("_local")
.get();
assertThat(getResponse.isExists(), is(true));
assertThat(getResponse.getVersion(), equalTo(1l));
assertThat(getResponse.getId(), equalTo(indexResponse.getId()));
}
}
/**
 * A 4-node cluster with m_m_n set to 3 and each node has one unicast endpoint. One node gets partitioned from the master node.
 * The temporal unicast responses are empty. When the partition is healed, the single ping response contains a master node.
 * The rejoining node should take this master node and connect.
*/
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE")
public void unicastSinglePingResponseContainsMaster() throws Exception {
List<String> nodes = startUnicastCluster(4, new int[]{0}, -1);
// Figure out which node is the elected master
final String masterNode = internalCluster().getMasterName();
logger.info("---> legit elected master node=" + masterNode);
List<String> otherNodes = new ArrayList<>(nodes);
otherNodes.remove(masterNode);
otherNodes.remove(nodes.get(0)); // <-- Don't isolate the node that acts as the unicast endpoint for all the other nodes.
final String isolatedNode = otherNodes.get(0);
// Forcefully clean the temporal response lists on all nodes. Otherwise the node in the unicast host list
// still remembers all the other nodes that have pinged it and the issue doesn't manifest.
for (ZenPingService pingService : internalCluster().getInstances(ZenPingService.class)) {
for (ZenPing zenPing : pingService.zenPings()) {
((UnicastZenPing) zenPing).clearTemporalResponses();
}
}
// Simulate a network issue between the unlucky node and elected master node in both directions.
NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(masterNode, isolatedNode, getRandom());
setDisruptionScheme(networkDisconnect);
networkDisconnect.startDisrupting();
// Wait until the elected master has removed the isolated node...
ensureStableCluster(3, masterNode);
// The isolated node must report no master, so it starts pinging
assertNoMaster(isolatedNode);
networkDisconnect.stopDisrupting();
// Wait until the master node sees all 4 nodes again.
ensureStableCluster(4);
// The elected master shouldn't have changed, since the isolated node could never have elected itself
// as master: a minimum_master_nodes of 3 could never be satisfied on its own.
assertMaster(masterNode, nodes);
}
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE")
public void isolatedUnicastNodes() throws Exception {
List<String> nodes = startUnicastCluster(3, new int[]{0}, -1);
// The unicast target is the only node listed in every node's unicast hosts
final String unicastTarget = nodes.get(0);
Set<String> unicastTargetSide = new HashSet<>();
unicastTargetSide.add(unicastTarget);
Set<String> restOfClusterSide = new HashSet<>();
restOfClusterSide.addAll(nodes);
restOfClusterSide.remove(unicastTarget);
// Forcefully clean the temporal response lists on all nodes. Otherwise the node in the unicast host list
// still remembers all the other nodes that have pinged it and the issue doesn't manifest.
for (ZenPingService pingService : internalCluster().getInstances(ZenPingService.class)) {
for (ZenPing zenPing : pingService.zenPings()) {
((UnicastZenPing) zenPing).clearTemporalResponses();
}
}
// Simulate a network issue between the unicast target node and the rest of the cluster
NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(unicastTargetSide, restOfClusterSide, getRandom());
setDisruptionScheme(networkDisconnect);
networkDisconnect.startDisrupting();
// Wait until the rest of the cluster has removed the isolated unicast target node...
ensureStableCluster(2, nodes.get(1));
// The isolated node must report no master, so it starts pinging
assertNoMaster(unicastTarget);
networkDisconnect.stopDisrupting();
// Wait until the master node sees all 3 nodes again.
ensureStableCluster(3);
}
/** Test cluster join with issues in cluster state publishing */
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE")
public void testClusterJoinDespiteOfPublishingIssues() throws Exception {
List<String> nodes = startCluster(2, 1);
String masterNode = internalCluster().getMasterName();
String nonMasterNode;
if (masterNode.equals(nodes.get(0))) {
nonMasterNode = nodes.get(1);
} else {
nonMasterNode = nodes.get(0);
}
DiscoveryNodes discoveryNodes = internalCluster().getInstance(ClusterService.class, nonMasterNode).state().nodes();
logger.info("blocking requests from non master [{}] to master [{}]", nonMasterNode, masterNode);
MockTransportService nonMasterTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, nonMasterNode);
nonMasterTransportService.addFailToSendNoConnectRule(discoveryNodes.masterNode());
assertNoMaster(nonMasterNode);
logger.info("blocking cluster state publishing from master [{}] to non master [{}]", masterNode, nonMasterNode);
MockTransportService masterTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, masterNode);
masterTransportService.addFailToSendNoConnectRule(discoveryNodes.localNode(), PublishClusterStateAction.ACTION_NAME);
logger.info("allowing requests from non master [{}] to master [{}], waiting for two join request", nonMasterNode, masterNode);
final CountDownLatch countDownLatch = new CountDownLatch(2);
nonMasterTransportService.addDelegate(discoveryNodes.masterNode(), new MockTransportService.DelegateTransport(nonMasterTransportService.original()) {
@Override
public void sendRequest(DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException {
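// count join requests the non-master sends to the master, while still forwarding them to the real transport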
if (action.equals(MembershipAction.DISCOVERY_JOIN_ACTION_NAME)) {
countDownLatch.countDown();
}
super.sendRequest(node, requestId, action, request, options);
}
});
countDownLatch.await();
logger.info("waiting for cluster to reform");
masterTransportService.clearRule(discoveryNodes.localNode());
nonMasterTransportService.clearRule(discoveryNodes.masterNode());
ensureStableCluster(2);
}
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE")
public void testClusterFormingWithASlowNode() throws Exception {
configureCluster(3, 2);
SlowClusterStateProcessing disruption = new SlowClusterStateProcessing(getRandom(), 0, 0, 5000, 6000);
// don't wait for the initial state, we want to add the disruption while the cluster is still forming...
internalCluster().startNodesAsync(3,
ImmutableSettings.builder()
.put(DiscoveryService.SETTING_INITIAL_STATE_TIMEOUT, "1ms")
.put(DiscoverySettings.PUBLISH_TIMEOUT, "3s")
.build()).get();
logger.info("applying disruption while cluster is forming ...");
internalCluster().setDisruptionScheme(disruption);
disruption.startDisrupting();
ensureStableCluster(3);
}
/**
 * Adds an asymmetric break between the master and one of the nodes and makes
 * sure that the node is removed from the cluster, that the node starts pinging and that
* the cluster reforms when healed.
*/
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE")
public void testNodeNotReachableFromMaster() throws Exception {
startCluster(3);
String masterNode = internalCluster().getMasterName();
String nonMasterNode = null;
while (nonMasterNode == null) {
nonMasterNode = randomFrom(internalCluster().getNodeNames());
if (nonMasterNode.equals(masterNode)) {
nonMasterNode = null;
}
}
logger.info("blocking request from master [{}] to [{}]", masterNode, nonMasterNode);
MockTransportService masterTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, masterNode);
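// randomly choose between silently dropping requests (unresponsive) and failing them fast (no connect)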
if (randomBoolean()) {
masterTransportService.addUnresponsiveRule(internalCluster().getInstance(ClusterService.class, nonMasterNode).localNode());
} else {
masterTransportService.addFailToSendNoConnectRule(internalCluster().getInstance(ClusterService.class, nonMasterNode).localNode());
}
logger.info("waiting for [{}] to be removed from cluster", nonMasterNode);
ensureStableCluster(2, masterNode);
logger.info("waiting for [{}] to have no master", nonMasterNode);
assertNoMaster(nonMasterNode);
logger.info("healing partition and checking cluster reforms");
masterTransportService.clearAllRules();
ensureStableCluster(3);
}
protected NetworkPartition addRandomPartition() {
NetworkPartition partition;
if (randomBoolean()) {
partition = new NetworkUnresponsivePartition(getRandom());
} else {
partition = new NetworkDisconnectPartition(getRandom());
}
setDisruptionScheme(partition);
return partition;
}
protected NetworkPartition addRandomIsolation(String isolatedNode) {
Set<String> side1 = new HashSet<>();
Set<String> side2 = new HashSet<>(Arrays.asList(internalCluster().getNodeNames()));
side1.add(isolatedNode);
side2.remove(isolatedNode);
NetworkPartition partition;
if (randomBoolean()) {
partition = new NetworkUnresponsivePartition(side1, side2, getRandom());
} else {
partition = new NetworkDisconnectPartition(side1, side2, getRandom());
}
internalCluster().setDisruptionScheme(partition);
return partition;
}
private ServiceDisruptionScheme addRandomDisruptionScheme() {
// TODO: add partial partitions
List<ServiceDisruptionScheme> list = Arrays.asList(
new NetworkUnresponsivePartition(getRandom()),
new NetworkDelaysPartition(getRandom()),
new NetworkDisconnectPartition(getRandom()),
new SlowClusterStateProcessing(getRandom())
);
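// pick one scheme at random by shuffling the list and taking the first element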
Collections.shuffle(list);
setDisruptionScheme(list.get(0));
return list.get(0);
}
private void ensureStableCluster(int nodeCount) {
ensureStableCluster(nodeCount, TimeValue.timeValueSeconds(30), null);
}
private void ensureStableCluster(int nodeCount, TimeValue timeValue) {
ensureStableCluster(nodeCount, timeValue, null);
}
private void ensureStableCluster(int nodeCount, @Nullable String viaNode) {
ensureStableCluster(nodeCount, TimeValue.timeValueSeconds(30), viaNode);
}
private void ensureStableCluster(int nodeCount, TimeValue timeValue, @Nullable String viaNode) {
if (viaNode == null) {
viaNode = randomFrom(internalCluster().getNodeNames());
}
logger.debug("ensuring cluster is stable with [{}] nodes. access node: [{}]. timeout: [{}]", nodeCount, viaNode, timeValue);
ClusterHealthResponse clusterHealthResponse = client(viaNode).admin().cluster().prepareHealth()
.setWaitForEvents(Priority.LANGUID)
.setWaitForNodes(Integer.toString(nodeCount))
.setTimeout(timeValue)
.setWaitForRelocatingShards(0)
.get();
if (clusterHealthResponse.isTimedOut()) {
ClusterStateResponse stateResponse = client(viaNode).admin().cluster().prepareState().get();
fail("failed to reach a stable cluster of [" + nodeCount + "] nodes. Tried via [" + viaNode + "]. last cluster state:\n"
+ stateResponse.getState().prettyPrint());
}
assertThat(clusterHealthResponse.isTimedOut(), is(false));
}
private ClusterState getNodeClusterState(String node) {
return client(node).admin().cluster().prepareState().setLocal(true).get().getState();
}
private void assertNoMaster(final String node) throws Exception {
assertNoMaster(node, null, TimeValue.timeValueSeconds(10));
}
private void assertNoMaster(final String node, TimeValue maxWaitTime) throws Exception {
assertNoMaster(node, null, maxWaitTime);
}
private void assertNoMaster(final String node, @Nullable final ClusterBlock expectedBlocks, TimeValue maxWaitTime) throws Exception {
assertBusy(new Runnable() {
@Override
public void run() {
ClusterState state = getNodeClusterState(node);
assertNull("node [" + node + "] still has [" + state.nodes().masterNode() + "] as master", state.nodes().masterNode());
if (expectedBlocks != null) {
for (ClusterBlockLevel level : expectedBlocks.levels()) {
assertTrue("node [" + node + "] does have level [" + level + "] in it's blocks", state.getBlocks().hasGlobalBlock(level));
}
}
}
}, maxWaitTime.getMillis(), TimeUnit.MILLISECONDS);
}
private void assertDifferentMaster(final String node, final String oldMasterNode) throws Exception {
assertBusy(new Runnable() {
@Override
public void run() {
ClusterState state = getNodeClusterState(node);
String masterNode = null;
if (state.nodes().masterNode() != null) {
masterNode = state.nodes().masterNode().name();
}
logger.trace("[{}] master is [{}]", node, state.nodes().masterNode());
assertThat("node [" + node + "] still has [" + masterNode + "] as master",
oldMasterNode, not(equalTo(masterNode)));
}
}, 10, TimeUnit.SECONDS);
}
private void assertMaster(String masterNode, List<String> nodes) {
for (String node : nodes) {
ClusterState state = getNodeClusterState(node);
String failMsgSuffix = "cluster_state:\n" + state.prettyPrint();
assertThat("wrong node count on [" + node + "]. " + failMsgSuffix, state.nodes().size(), equalTo(nodes.size()));
assertThat("wrong master on node [" + node + "]. " + failMsgSuffix, state.nodes().masterNode().name(), equalTo(masterNode));
}
}
}