/*
* Copyright (c) 2014 Spotify AB.
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.helios.system;
import com.spotify.docker.client.DockerClient;
import com.spotify.docker.client.messages.Container;
import com.spotify.helios.Polling;
import com.spotify.helios.agent.AgentMain;
import com.spotify.helios.client.HeliosClient;
import com.spotify.helios.common.descriptors.Deployment;
import com.spotify.helios.common.descriptors.Job;
import com.spotify.helios.common.descriptors.JobId;
import com.spotify.helios.common.descriptors.JobStatus;
import com.spotify.helios.common.descriptors.TaskStatus;
import com.spotify.helios.common.protocol.CreateJobResponse;
import com.spotify.helios.common.protocol.JobDeployResponse;
import org.junit.Test;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.Callable;
import static com.spotify.helios.common.descriptors.Goal.START;
import static com.spotify.helios.common.descriptors.HostStatus.Status.UP;
import static com.spotify.helios.common.descriptors.TaskStatus.State.RUNNING;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
public class AgentZooKeeperDownTolerationTest extends SystemTestBase {
@Test
public void test() throws Exception {
startDefaultMaster();
final DockerClient dockerClient = getNewDockerClient();
final HeliosClient client = defaultClient();
final AgentMain agent1 = startDefaultAgent(testHost());
// Create a job
final Job job = Job.newBuilder()
.setName(testJobName)
.setVersion(testJobVersion)
.setImage(BUSYBOX)
.setCommand(IDLE_COMMAND)
.setCreatingUser(TEST_USER)
.build();
final JobId jobId = job.getId();
final CreateJobResponse created = client.createJob(job).get();
assertEquals(CreateJobResponse.Status.OK, created.getStatus());
// Wait for agent to come up
awaitHostRegistered(client, testHost(), LONG_WAIT_SECONDS, SECONDS);
awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);
// Deploy the job on the agent
final Deployment deployment = Deployment.of(jobId, START);
final JobDeployResponse deployed = client.deploy(deployment, testHost()).get();
assertEquals(JobDeployResponse.Status.OK, deployed.getStatus());
// Wait for the job to run
final TaskStatus firstTaskStatus = awaitJobState(client, testHost(), jobId, RUNNING,
LONG_WAIT_SECONDS, SECONDS);
assertEquals(job, firstTaskStatus.getJob());
assertNotNull(dockerClient.inspectContainer(firstTaskStatus.getContainerId()));
// Stop zookeeper
zk().stop();
// Wait for a while and make sure that the container is still running
Thread.sleep(5000);
assertTrue(
dockerClient.inspectContainer(firstTaskStatus.getContainerId()).state().running());
// Stop the agent
agent1.stopAsync().awaitTerminated();
// Start the agent again
final AgentMain agent2 = startDefaultAgent(testHost());
// Wait for a while and make sure that the same container is still running
Thread.sleep(5000);
assertTrue(
dockerClient.inspectContainer(firstTaskStatus.getContainerId()).state().running());
// Kill the container
dockerClient.killContainer(firstTaskStatus.getContainerId());
assertFalse(
dockerClient.inspectContainer(firstTaskStatus.getContainerId()).state().running());
// Wait for a while and make sure that a new container was spawned
final String firstRestartedContainerId =
Polling.await(LONG_WAIT_SECONDS, SECONDS, new Callable<String>() {
@Override
public String call() throws Exception {
final List<Container> containers = listContainers(dockerClient, testTag);
return containers.size() == 1 ? containers.get(0).id() : null;
}
});
// Stop the agent
agent2.stopAsync().awaitTerminated();
// Kill the container
dockerClient.killContainer(firstRestartedContainerId);
assertFalse(dockerClient.inspectContainer(firstRestartedContainerId).state().running());
// Start the agent again
startDefaultAgent(testHost());
// Wait for a while and make sure that a new container was spawned
final String secondRestartedContainerId =
Polling.await(LONG_WAIT_SECONDS, SECONDS, new Callable<String>() {
@Override
public String call() throws Exception {
final List<Container> containers = listContainers(dockerClient, testTag);
return containers.size() == 1 ? containers.get(0).id() : null;
}
});
assertTrue(dockerClient.inspectContainer(secondRestartedContainerId).state().running());
// Start zookeeper
zk().start();
// Verify that the agent is listed as up
awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);
// Wait for the new container id to be reflected in the task status
Polling.await(LONG_WAIT_SECONDS, SECONDS, new Callable<TaskStatus>() {
@Override
public TaskStatus call() throws Exception {
final JobStatus jobStatus = client.jobStatus(jobId).get();
final TaskStatus taskStatus = jobStatus.getTaskStatuses().get(testHost());
return taskStatus != null && Objects.equals(taskStatus.getContainerId(),
secondRestartedContainerId)
? taskStatus : null;
}
});
}
}