Package org.elasticsearch.cluster.node

Examples of org.elasticsearch.cluster.node.DiscoveryNodes$Delta

        RoutingTable routingTable = RoutingTable.builder()

        DiscoveryNodes nodes = DiscoveryNodes.builder().put(newNode("node1")).put(newNode("node2")).put(newNode("node3")).build();

        ClusterState clusterState = ClusterState.builder(org.elasticsearch.cluster.ClusterName.DEFAULT).nodes(nodes).metaData(metaData).routingTable(routingTable).build();

        AllocationService strategy = createAllocationService();
        RoutingTable source = strategy.reroute(clusterState).routingTable();
    public void close() {

    /**
     * Reads the nodes of the current cluster state, takes the master-and-data
     * subset and iterates over it, looking up each node's version.
     * <p>
     * NOTE(review): this excerpt is cut off inside the loop. How the collected
     * {@code nodes} list, {@code localNode} and the three parameters are used is
     * not visible here and must be confirmed against the full source.
     *
     * @param repository        presumably the name of the repository to verify - TODO confirm
     * @param verificationToken presumably a token forwarded with the verification request - TODO confirm
     * @param listener          presumably notified with the verification outcome - TODO confirm
     */
    public void verify(String repository, String verificationToken, final ActionListener<VerifyResponse> listener) {
        final DiscoveryNodes discoNodes = clusterService.state().nodes();
        final DiscoveryNode localNode = discoNodes.localNode();

        // only master and data nodes are considered; the list below is filled by the (elided) loop body
        final ObjectContainer<DiscoveryNode> masterAndDataNodes = discoNodes.masterAndDataNodes().values();
        final List<DiscoveryNode> nodes = newArrayList();
        for (ObjectCursor<DiscoveryNode> cursor : masterAndDataNodes) {
            DiscoveryNode node = cursor.value;
            Version version = node.getVersion();
            // Verification wasn't supported before v1.4.0 - no reason to send verification request to these nodes
            // Check if we just became the master
            final boolean newMaster = !event.previousState().nodes().localNodeMaster();
            clusterService.submitStateUpdateTask("update snapshot state after node removal", new ClusterStateUpdateTask() {
                public ClusterState execute(ClusterState currentState) throws Exception {
                    DiscoveryNodes nodes = currentState.nodes();
                    MetaData metaData = currentState.metaData();
                    MetaData.Builder mdBuilder = MetaData.builder(currentState.metaData());
                    SnapshotMetaData snapshots = metaData.custom(SnapshotMetaData.TYPE);
                    if (snapshots == null) {
                        return currentState;
                    boolean changed = false;
                    ArrayList<SnapshotMetaData.Entry> entries = newArrayList();
                    for (final SnapshotMetaData.Entry snapshot : snapshots.entries()) {
                        SnapshotMetaData.Entry updatedSnapshot = snapshot;
                        boolean snapshotChanged = false;
                        if (snapshot.state() == State.STARTED || snapshot.state() == State.ABORTED) {
                            ImmutableMap.Builder<ShardId, ShardSnapshotStatus> shards = ImmutableMap.builder();
                            for (ImmutableMap.Entry<ShardId, ShardSnapshotStatus> shardEntry : snapshot.shards().entrySet()) {
                                ShardSnapshotStatus shardStatus = shardEntry.getValue();
                                if (!shardStatus.state().completed() && shardStatus.nodeId() != null) {
                                    if (nodes.nodeExists(shardStatus.nodeId())) {
                                    } else {
                                        // TODO: Restart snapshot on another node?
                                        snapshotChanged = true;
                                        logger.warn("failing snapshot of shard [{}] on closed node [{}]", shardEntry.getKey(), shardStatus.nodeId());
        /**
         * Handles a ping request received from another node and prepares a
         * {@code MulticastPingResponse} describing the local node.
         * <p>
         * Visible checks, in order: pinging must be enabled and a multicast channel
         * must exist; the request must not come from this node; the request's
         * cluster name must equal the local {@code clusterName}; and the local node
         * must be allowed to connect to the requester ({@code shouldConnectTo}).
         * The log messages and comments indicate that a failed check causes the
         * request to be ignored.
         * <p>
         * NOTE(review): the bodies of the guard clauses and the code that actually
         * sends the response are elided in this excerpt - confirm against the full source.
         *
         * @param id                 identifier of the ping request; used in trace logging
         * @param requestingNodeX    the node that sent the ping request
         * @param requestClusterName the cluster name carried by the request
         */
        private void handleNodePingRequest(int id, DiscoveryNode requestingNodeX, ClusterName requestClusterName) {
            if (!pingEnabled || multicastChannel == null) {
            final DiscoveryNodes discoveryNodes = contextProvider.nodes();
            final DiscoveryNode requestingNode = requestingNodeX;
            // NOTE(review): the condition on the next line is missing in this excerpt; per the
            // comment below it presumably detects a request that originated from the local node
            if ( {
                // that's me, ignore
            if (!requestClusterName.equals(clusterName)) {
                if (logger.isTraceEnabled()) {
                    logger.trace("[{}] received ping_request from [{}], but wrong cluster_name [{}], expected [{}], ignoring",
                            id, requestingNode, requestClusterName.value(), clusterName.value());
            // don't connect between two client nodes, no need for that...
            if (!discoveryNodes.localNode().shouldConnectTo(requestingNode)) {
                if (logger.isTraceEnabled()) {
                    logger.trace("[{}] received ping_request from [{}], both are client nodes, ignoring", id, requestingNode, requestClusterName);
            final MulticastPingResponse multicastPingResponse = new MulticastPingResponse();
            // NOTE(review): the assignment target on the next line is missing in this excerpt
   = id;
            // the response carries the local node, the currently known master, the cluster name
            // and whether this node has joined the cluster at least once
            multicastPingResponse.pingResponse = new PingResponse(discoveryNodes.localNode(), discoveryNodes.masterNode(), clusterName, contextProvider.nodeHasJoinedClusterOnce());

            if (logger.isTraceEnabled()) {
                logger.trace("[{}] received ping_request from [{}], sending {}", id, requestingNode, multicastPingResponse.pingResponse);
                    if (settings.get(SETTING_AUTO_EXPAND_REPLICAS) != null && indexSettingsBuilder.get(SETTING_AUTO_EXPAND_REPLICAS) == null) {
                        indexSettingsBuilder.put(SETTING_AUTO_EXPAND_REPLICAS, settings.get(SETTING_AUTO_EXPAND_REPLICAS));

                    if (indexSettingsBuilder.get(SETTING_VERSION_CREATED) == null) {
                        DiscoveryNodes nodes = currentState.nodes();
                        final Version createdVersion = Version.smallest(version, nodes.smallestNonClientNodeVersion());
                        indexSettingsBuilder.put(SETTING_VERSION_CREATED, createdVersion);

                    if (indexSettingsBuilder.get(SETTING_CREATION_DATE) == null) {
                        indexSettingsBuilder.put(SETTING_CREATION_DATE, System.currentTimeMillis());
            checkStateMeetsSettingsAndMaybeRecover(event.state(), true);

    /**
     * Checks the given cluster state against the gateway recovery settings and,
     * if none of the blocking conditions applies, hands over to
     * {@code performStateRecovery}.
     * <p>
     * Recovery is not attempted (only a debug message is logged) when:
     * <ul>
     *   <li>the state still carries the discovery service's no-master global block;</li>
     *   <li>{@code recoverAfterNodes} is set and there are fewer master+data nodes;</li>
     *   <li>{@code recoverAfterDataNodes} is set and there are fewer data nodes;</li>
     *   <li>{@code recoverAfterMasterNodes} is set and there are fewer master nodes.</li>
     * </ul>
     * A value of {@code -1} means the corresponding setting is not set and its check is skipped.
     * <p>
     * Otherwise the method decides whether the recover-after-time delay must be
     * enforced. It is enforced when no {@code expected*} setting is configured,
     * or when the first configured expectation checked (nodes, then data nodes,
     * then master nodes) is not met. If all configured expectations are met the
     * delay is not enforced and {@code reason} stays empty.
     * <p>
     * NOTE(review): closing braces are elided in this excerpt, so the exact
     * nesting of the final {@code performStateRecovery} call should be confirmed
     * against the full source.
     *
     * @param state         the cluster state whose blocks and nodes are inspected
     * @param asyncRecovery passed through unchanged to {@code performStateRecovery}
     */
    protected void checkStateMeetsSettingsAndMaybeRecover(ClusterState state, boolean asyncRecovery) {
        DiscoveryNodes nodes = state.nodes();
        if (state.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock())) {
            logger.debug("not recovering from gateway, no master elected yet");
        } else if (recoverAfterNodes != -1 && (nodes.masterAndDataNodes().size()) < recoverAfterNodes) {
            logger.debug("not recovering from gateway, nodes_size (data+master) [" + nodes.masterAndDataNodes().size() + "] < recover_after_nodes [" + recoverAfterNodes + "]");
        } else if (recoverAfterDataNodes != -1 && nodes.dataNodes().size() < recoverAfterDataNodes) {
            logger.debug("not recovering from gateway, nodes_size (data) [" + nodes.dataNodes().size() + "] < recover_after_data_nodes [" + recoverAfterDataNodes + "]");
        } else if (recoverAfterMasterNodes != -1 && nodes.masterNodes().size() < recoverAfterMasterNodes) {
            logger.debug("not recovering from gateway, nodes_size (master) [" + nodes.masterNodes().size() + "] < recover_after_master_nodes [" + recoverAfterMasterNodes + "]");
        } else {
            // all recover_after_* thresholds are satisfied; now decide whether to wait for recover_after_time
            boolean enforceRecoverAfterTime;
            String reason;
            if (expectedNodes == -1 && expectedMasterNodes == -1 && expectedDataNodes == -1) {
                // no expected is set, honor the setting if they are there
                enforceRecoverAfterTime = true;
                reason = "recovery_after_time was set to [" + recoverAfterTime + "]";
            } else {
                // one of the expected is set, see if all of them meet the need, and ignore the timeout in this case
                enforceRecoverAfterTime = false;
                reason = "";
                if (expectedNodes != -1 && (nodes.masterAndDataNodes().size() < expectedNodes)) { // does not meet the expected...
                    enforceRecoverAfterTime = true;
                    reason = "expecting [" + expectedNodes + "] nodes, but only have [" + nodes.masterAndDataNodes().size() + "]";
                } else if (expectedDataNodes != -1 && (nodes.dataNodes().size() < expectedDataNodes)) { // does not meet the expected...
                    enforceRecoverAfterTime = true;
                    reason = "expecting [" + expectedDataNodes + "] data nodes, but only have [" + nodes.dataNodes().size() + "]";
                } else if (expectedMasterNodes != -1 && (nodes.masterNodes().size() < expectedMasterNodes)) { // does not meet the expected...
                    enforceRecoverAfterTime = true;
                    reason = "expecting [" + expectedMasterNodes + "] master nodes, but only have [" + nodes.masterNodes().size() + "]";
            performStateRecovery(asyncRecovery, enforceRecoverAfterTime, reason);
                final LocalDiscovery master = firstMaster;
                master.clusterService.submitStateUpdateTask("local-disco-update", new ClusterStateNonMasterUpdateTask() {
                    public ClusterState execute(ClusterState currentState) {
                        DiscoveryNodes newNodes = currentState.nodes().removeDeadMembers(newMembers,;
                        DiscoveryNodes.Delta delta =;
                        if (delta.added()) {
                            logger.warn("No new nodes should be created when a new discovery view is accepted");
                        // reroute here, so we eagerly remove dead nodes from the routing
                        ClusterState updatedState = ClusterState.builder(currentState).nodes(newNodes).build();
        return table;

    /**
     * Starts from the table returned by {@code getTableWithHeader(request)} and
     * looks up the master node, by its id, among the nodes of the cluster state
     * contained in the response.
     * <p>
     * NOTE(review): the excerpt ends at the {@code master == null} check; how the
     * missing-master case is handled and which cells are added is not visible here.
     *
     * @param request the REST request used to obtain the table header
     * @param state   the cluster state response whose nodes are inspected
     * @return presumably the populated table - TODO confirm against the full source
     */
    private Table buildTable(RestRequest request, ClusterStateResponse state) {
        Table table = getTableWithHeader(request);
        DiscoveryNodes nodes = state.getState().nodes();

        // resolve the master through its node id; null when no such node is present
        DiscoveryNode master = nodes.get(nodes.masterNodeId());
        if (master == null) {
            return new MasterPingRequest();

        /**
         * Handles a master fault detection ping sent by another node.
         * <p>
         * The request is rejected with {@code NotMasterException} when the master id
         * it targets is not the local node id, or when it carries a non-null cluster
         * name that differs from the local {@code clusterName}.
         * <p>
         * If the locally visible cluster state does not show this node as master, or
         * does not contain the pinging node, the check is repeated inside a cluster
         * state update task. That task throws {@code NoLongerMasterException} or
         * {@code NodeDoesNotExistOnMasterException} if the condition still holds, and
         * otherwise returns the state unchanged.
         * <p>
         * NOTE(review): closing braces and the code that answers on {@code channel}
         * are elided in this excerpt - confirm against the full source.
         *
         * @param request the ping request; {@code masterNodeId}, {@code clusterName} and {@code nodeId} are read
         * @param channel the transport channel of the request (its use is not visible in this excerpt)
         * @throws Exception declared by the handler signature; the visible code throws {@code NotMasterException}
         */
        public void messageReceived(final MasterPingRequest request, final TransportChannel channel) throws Exception {
            final DiscoveryNodes nodes = clusterService.state().nodes();
            // check if we are really the same master as the one the sender thinks we are
            // this can happen if the master got "kill -9" and then another node started using the same port
            if (!request.masterNodeId.equals(nodes.localNodeId())) {
                throw new NotMasterException();

            // ping from nodes of version < 1.4.0 will have the clustername set to null
            if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
                logger.trace("master fault detection ping request is targeted for a different [{}] cluster then us [{}]", request.clusterName, clusterName);
                throw new NotMasterException("master fault detection ping request is targeted for a different [" + request.clusterName + "] cluster then us [" + clusterName + "]");

            // when we are elected as master or when a node joins, we use a cluster state update thread
            // to incorporate that information in the cluster state. That cluster state is published
            // before we make it available locally. This means that a master ping can come from a node
            // that has already processed the new CS but it is not known locally.
            // Therefore, if we fail we have to check again under a cluster state thread to make sure
            // all processing is finished.

            if (!nodes.localNodeMaster() || !nodes.nodeExists(request.nodeId)) {
                logger.trace("checking ping from [{}] under a cluster state thread", request.nodeId);
                clusterService.submitStateUpdateTask("master ping (from: [" + request.nodeId + "])", new ProcessedClusterStateNonMasterUpdateTask() {

                    public ClusterState execute(ClusterState currentState) throws Exception {
                        // if we are no longer master, fail...
                        DiscoveryNodes nodes = currentState.nodes();
                        if (!nodes.localNodeMaster()) {
                            throw new NoLongerMasterException();
                        // the pinging node must be part of the cluster state seen by this task
                        if (!nodes.nodeExists(request.nodeId)) {
                            throw new NodeDoesNotExistOnMasterException();
                        // both checks passed: the task leaves the cluster state untouched
                        return currentState;
    public boolean allocateUnassigned(RoutingAllocation allocation) {
        boolean changed = false;
        DiscoveryNodes nodes = allocation.nodes();
        RoutingNodes routingNodes = allocation.routingNodes();

        // First, handle primaries, they must find a place to be allocated on here
        Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
        while (unassignedIterator.hasNext()) {
            MutableShardRouting shard =;

            if (!shard.primary()) {

            // this is an API allocation, ignore since we know there is no data...
            if (!routingNodes.routingTable().index(shard.index()).shard( {

            ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);

            int numberOfAllocationsFound = 0;
            long highestVersion = -1;
            Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
            final boolean[] states = nodesState.allocated;
            final Object[] keys = nodesState.keys;
            final long[] values = nodesState.values;
            for (int i = 0; i < states.length; i++) {
                if (!states[i]) {

                DiscoveryNode node = (DiscoveryNode) keys[i];
                long version = values[i];
                // since we don't check in NO allocation, we need to double check here
                if (allocation.shouldIgnoreShardForNode(shard.shardId(), {
                if (version != -1) {
                    if (highestVersion == -1) {
                        highestVersion = version;
                    } else {
                        if (version > highestVersion) {
                            highestVersion = version;
                        } else if (version == highestVersion) {

            // check if the counts meets the minimum set
            int requiredAllocation = 1;
            try {
                IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
                String initialShards = indexMetaData.settings().get(INDEX_RECOVERY_INITIAL_SHARDS, settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
                if ("quorum".equals(initialShards)) {
                    if (indexMetaData.numberOfReplicas() > 1) {
                        requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
                } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
                    if (indexMetaData.numberOfReplicas() > 2) {
                        requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
                } else if ("one".equals(initialShards)) {
                    requiredAllocation = 1;
                } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
                    requiredAllocation = indexMetaData.numberOfReplicas() + 1;
                } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
                    if (indexMetaData.numberOfReplicas() > 1) {
                        requiredAllocation = indexMetaData.numberOfReplicas();
                } else {
                    requiredAllocation = Integer.parseInt(initialShards);
            } catch (Exception e) {
                logger.warn("[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}", shard.index(),, initialShards, shard);

            // not enough found for this shard, continue...
            if (numberOfAllocationsFound < requiredAllocation) {
                // we can't really allocate, so ignore it and continue
                if (logger.isDebugEnabled()) {
                    logger.debug("[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]", shard.index(),, numberOfAllocationsFound, requiredAllocation);

            Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
            Set<DiscoveryNode> noNodes = Sets.newHashSet();
            for (DiscoveryNode discoNode : nodesWithHighestVersion) {
                RoutingNode node = routingNodes.node(;
                if (node == null) {

                Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
                if (decision.type() == Decision.Type.THROTTLE) {
                } else if (decision.type() == Decision.Type.NO) {
                } else {
                    if (logger.isDebugEnabled()) {
                        logger.debug("[{}][{}]: allocating [{}] to [{}] on primary allocation", shard.index(),, shard, discoNode);
                    // we found a match
                    changed = true;
                    // make sure we create one with the version from the recovered state
                    allocation.routingNodes().assign(new MutableShardRouting(shard, highestVersion), node.nodeId());

                    // found a node, so no throttling, no "no", and break out of the loop
            if (throttledNodes.isEmpty()) {
                // if we have a node that we "can't" allocate to, force allocation, since this is our master data!
                if (!noNodes.isEmpty()) {
                    DiscoveryNode discoNode = noNodes.iterator().next();
                    RoutingNode node = routingNodes.node(;
                    if (logger.isDebugEnabled()) {
                        logger.debug("[{}][{}]: forcing allocating [{}] to [{}] on primary allocation", shard.index(),, shard, discoNode);
                    // we found a match
                    changed = true;
                    // make sure we create one with the version from the recovered state
                    allocation.routingNodes().assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
            } else {
                if (logger.isDebugEnabled()) {
                    logger.debug("[{}][{}]: throttling allocation [{}] to [{}] on primary allocation", shard.index(),, shard, throttledNodes);
                // we are throttling this, but we have enough to allocate to this node, ignore it for now

        if (!routingNodes.hasUnassigned()) {
            return changed;

        // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was allocated on
        unassignedIterator = routingNodes.unassigned().iterator();
        while (unassignedIterator.hasNext()) {
            MutableShardRouting shard =;

            // pre-check if it can be allocated to any node that currently exists, so we won't list the store for it for nothing
            boolean canBeAllocatedToAtLeastOneNode = false;
            for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
                RoutingNode node = routingNodes.node(;
                if (node == null) {
                // if we can't allocate it on a node, ignore it, for example, this handles
                // cases for only allocating a replica after a primary
                Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
                if (decision.type() == Decision.Type.YES) {
                    canBeAllocatedToAtLeastOneNode = true;

            if (!canBeAllocatedToAtLeastOneNode) {

            Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = buildShardStores(nodes, shard);

            long lastSizeMatched = 0;
            DiscoveryNode lastDiscoNodeMatched = null;
            RoutingNode lastNodeMatched = null;

            for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> nodeStoreEntry : shardStores.entrySet()) {
                DiscoveryNode discoNode = nodeStoreEntry.getKey();
                TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue();
                logger.trace("{}: checking node [{}]", shard, discoNode);

                if (storeFilesMetaData == null) {
                    // already allocated on that node...

                RoutingNode node = routingNodes.node(;
                if (node == null) {

                // check if we can allocate on that node...
                // we only check for NO, since if this node is THROTTLING and it has enough "same data"
                // then we will try and assign it next time
                Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
                if (decision.type() == Decision.Type.NO) {

                // if it is already allocated, we can't assign to it...
                if (storeFilesMetaData.allocated()) {

                if (!shard.primary()) {
                    MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
                    if (primaryShard != null) {
                        DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
                        if (primaryNode != null) {
                            TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = shardStores.get(primaryNode);
                            if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                                long sizeMatched = 0;
