/**
 * Deploys {@code deployment} onto {@code host} using a single ZooKeeper transaction, calling
 * itself again with {@code count + 1} when the transaction fails with a
 * {@link NoNodeException} and the job/host assertions still pass.
 *
 * <p>The transaction checks the job config node, creates one node per static port of the job,
 * creates the job-to-host node, and creates the task node together with a creation marker node
 * whose path embeds a freshly generated operation id. When the transaction fails with
 * {@link NodeExistsException}, that marker is used to tell "our own transaction already went
 * through" apart from a genuine conflict.
 *
 * @param client     ZooKeeper client used for all reads and for the transaction
 * @param host       name of the host to deploy to
 * @param deployment the deployment (job id, goal, deployer user) to apply
 * @param count      number of attempts made so far; the method gives up when this equals 3
 * @throws JobDoesNotExistException           if {@code getJob} returns {@code null} for the job
 *                                            id (presumably also raised by
 *                                            {@code assertJobExists} - confirm)
 * @throws JobAlreadyDeployedException        if a task node for this host/job already exists
 * @throws HostNotFoundException              presumably raised by {@code assertHostExists} when
 *                                            the host is gone - confirm against that helper
 * @throws JobPortAllocationConflictException if the transaction hit an existing node and one of
 *                                            the job's static port nodes exists on the host
 * @throws HeliosRuntimeException             on the third failed attempt, or when any ZooKeeper
 *                                            operation fails in a way not covered above
 */
private void deployJobRetry(final ZooKeeperClient client, final String host,
                            final Deployment deployment, int count)
    throws JobDoesNotExistException, JobAlreadyDeployedException, HostNotFoundException,
           JobPortAllocationConflictException {
  if (count == 3) {
    throw new HeliosRuntimeException("3 failures (possibly concurrent modifications) while " +
                                     "deploying. Giving up.");
  }
  log.info("deploying {}: {} (retry={})", deployment, host, count);

  final JobId id = deployment.getJobId();
  final Job job = getJob(id);
  if (job == null) {
    throw new JobDoesNotExistException(id);
  }

  // A fresh id per attempt: the creation marker path is unique to this attempt's transaction.
  final UUID operationId = UUID.randomUUID();
  final String jobPath = Paths.configJob(id);
  final String taskPath = Paths.configHostJob(host, id);
  final String taskCreationPath = Paths.configHostJobCreation(host, id, operationId);

  // Each static port gets a node under the host whose payload is the owning job id.
  final List<Integer> staticPorts = staticPorts(job);
  final Map<String, byte[]> portNodes = Maps.newHashMap();
  final byte[] idJson = id.toJsonBytes();
  for (final int port : staticPorts) {
    final String path = Paths.configHostPort(host, port);
    portNodes.put(path, idJson);
  }

  final Task task = new Task(job, deployment.getGoal(), deployment.getDeployerUser());
  final List<ZooKeeperOperation> operations = Lists.newArrayList(
      check(jobPath),
      create(portNodes),
      create(Paths.configJobHost(id, host)));

  // Attempt to read a task here.
  try {
    client.getNode(taskPath);
    // if we get here the node exists already
    throw new JobAlreadyDeployedException(host, id);
  } catch (NoNodeException e) {
    // No task yet: add the task node and the creation marker to the transaction.
    operations.add(create(taskPath, task));
    operations.add(create(taskCreationPath));
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("reading existing task description failed", e);
  }

  // TODO (dano): Failure handling is racy wrt agent and job modifications.
  try {
    client.transaction(operations);
    log.info("deployed {}: {} (retry={})", deployment, host, count);
  } catch (NoNodeException e) {
    // Either the job, the host or the task went away
    assertJobExists(client, id);
    assertHostExists(client, host);
    // If the job and host still exists, we likely tried to redeploy a job that had an UNDEPLOY
    // goal and lost the race with the agent removing the task before we could set it. Retry.
    deployJobRetry(client, host, deployment, count + 1);
  } catch (NodeExistsException e) {
    // Check for conflict due to transaction retry
    try {
      if (client.exists(taskCreationPath) != null) {
        // Our creation operation node existed, we're done here
        return;
      }
    } catch (KeeperException ex) {
      throw new HeliosRuntimeException("checking job deployment failed", ex);
    }

    try {
      // Check if the job was already deployed
      if (client.stat(taskPath) != null) {
        throw new JobAlreadyDeployedException(host, id);
      }
    } catch (KeeperException ex) {
      // Wrap the failure of this check (ex), not the outer NodeExistsException (e), so the
      // reported cause is the operation that actually failed.
      throw new HeliosRuntimeException("checking job deployment failed", ex);
    }

    // Check for static port collisions
    for (final int port : staticPorts) {
      final String path = Paths.configHostPort(host, port);
      try {
        if (client.stat(path) == null) {
          continue;
        }
        final byte[] b = client.getData(path);
        final JobId existingJobId = parse(b, JobId.class);
        throw new JobPortAllocationConflictException(id, existingJobId, host, port);
      } catch (KeeperException | IOException ex) {
        // Same as above: propagate the real cause of the failed port lookup/parse.
        throw new HeliosRuntimeException("checking port allocations failed", ex);
      }
    }

    // Catch all for logic and ephemeral issues
    throw new HeliosRuntimeException("deploying job failed", e);
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("deploying job failed", e);
  }
}