for (int pId : pSet) {
final String pName = pName(jobResource, pId);
// Check for pending state transitions on this (partition, instance).
String pendingState =
currStateOutput.getPendingState(jobResource, new Partition(pName), instance);
if (pendingState != null) {
// There is a pending state transition for this (partition, instance). Just copy forward
// the state assignment from the previous ideal state.
Map<String, String> stateMap = prevAssignment.getReplicaMap(new Partition(pName));
if (stateMap != null) {
String prevState = stateMap.get(instance);
paMap.put(pId, new PartitionAssignment(instance, prevState));
assignedPartitions.add(pId);
LOG.debug(String
.format(
"Task partition %s has a pending state transition on instance %s. Using the previous ideal state which was %s.",
pName, instance, prevState));
}
continue;
}
TaskPartitionState currState =
TaskPartitionState.valueOf(currStateOutput.getCurrentState(jobResource, new Partition(
pName), instance));
jobCtx.setPartitionState(pId, currState);
// Process any requested state transitions.
String requestedStateStr =
currStateOutput.getRequestedState(jobResource, new Partition(pName), instance);
if (requestedStateStr != null && !requestedStateStr.isEmpty()) {
TaskPartitionState requestedState = TaskPartitionState.valueOf(requestedStateStr);
if (requestedState.equals(currState)) {
LOG.warn(String.format(
"Requested state %s is the same as the current state for instance %s.",
requestedState, instance));
}
paMap.put(pId, new PartitionAssignment(instance, requestedState.name()));
assignedPartitions.add(pId);
LOG.debug(String.format(
"Instance %s requested a state transition to %s for partition %s.", instance,
requestedState, pName));
continue;
}
switch (currState) {
case RUNNING:
case STOPPED: {
TaskPartitionState nextState;
if (jobTgtState == TargetState.START) {
nextState = TaskPartitionState.RUNNING;
} else {
nextState = TaskPartitionState.STOPPED;
}
paMap.put(pId, new PartitionAssignment(instance, nextState.name()));
assignedPartitions.add(pId);
LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName,
nextState, instance));
}
break;
case COMPLETED: {
// The task has completed on this partition. Mark as such in the context object.
donePartitions.add(pId);
LOG.debug(String
.format(
"Task partition %s has completed with state %s. Marking as such in rebalancer context.",
pName, currState));
partitionsToDropFromIs.add(pId);
markPartitionCompleted(jobCtx, pId);
}
break;
case TIMED_OUT:
case TASK_ERROR:
case ERROR: {
donePartitions.add(pId); // The task may be rescheduled on a different instance.
LOG.debug(String.format(
"Task partition %s has error state %s. Marking as such in rebalancer context.",
pName, currState));
markPartitionError(jobCtx, pId, currState, true);
// The error policy is to fail the job as soon as a single partition has failed for the
// configured maximum number of attempts.
if (jobCtx.getPartitionNumAttempts(pId) >= jobCfg.getMaxAttemptsPerTask()) {
// If the user does not require this task to succeed in order for the job to succeed,
// then we don't have to fail the job right now
boolean successOptional = false;
String taskId = jobCtx.getTaskIdForPartition(pId);
if (taskId != null) {
TaskConfig taskConfig = jobCfg.getTaskConfig(taskId);
if (taskConfig != null) {
successOptional = taskConfig.isSuccessOptional();
}
}
// Similarly, if we have some leeway for how many tasks we can fail, then we don't have
// to fail the job immediately
if (skippedPartitions.size() < jobCfg.getFailureThreshold()) {
successOptional = true;
}
if (!successOptional) {
long finishTime = currentTime;
workflowCtx.setJobState(jobResource, TaskState.FAILED);
if (workflowConfig.isTerminable()) {
workflowCtx.setWorkflowState(TaskState.FAILED);
workflowCtx.setFinishTime(finishTime);
}
jobCtx.setFinishTime(finishTime);
markAllPartitionsError(jobCtx, currState, false);
addAllPartitions(allPartitions, partitionsToDropFromIs);
return emptyAssignment(jobResource, currStateOutput);
} else {
skippedPartitions.add(pId);
partitionsToDropFromIs.add(pId);
}
} else {
// Mark the task to be started at some later time (if enabled)
markPartitionDelayed(jobCfg, jobCtx, pId);
}
}
break;
case INIT:
case DROPPED: {
// currState in [INIT, DROPPED]. Do nothing, the partition is eligible to be reassigned.
donePartitions.add(pId);
LOG.debug(String.format(
"Task partition %s has state %s. It will be dropped from the current ideal state.",
pName, currState));
}
break;
default:
throw new AssertionError("Unknown enum symbol: " + currState);
}
}
// Remove the set of task partitions that are completed or in one of the error states.
pSet.removeAll(donePartitions);
}
// For delayed tasks, trigger a rebalance event for the closest upcoming ready time
scheduleForNextTask(jobResource, jobCtx, currentTime);
if (isJobComplete(jobCtx, allPartitions, skippedPartitions)) {
workflowCtx.setJobState(jobResource, TaskState.COMPLETED);
jobCtx.setFinishTime(currentTime);
if (isWorkflowComplete(workflowCtx, workflowConfig)) {
workflowCtx.setWorkflowState(TaskState.COMPLETED);
workflowCtx.setFinishTime(currentTime);
}
}
// Make additional task assignments if needed.
if (jobTgtState == TargetState.START) {
// Contains the set of task partitions that must be excluded from consideration when making
// any new assignments.
// This includes all completed, failed, delayed, and already assigned partitions.
Set<Integer> excludeSet = Sets.newTreeSet(assignedPartitions);
addCompletedPartitions(excludeSet, jobCtx, allPartitions);
excludeSet.addAll(skippedPartitions);
excludeSet.addAll(getNonReadyPartitions(jobCtx, currentTime));
// Get instance->[partition, ...] mappings for the target resource.
Map<String, SortedSet<Integer>> tgtPartitionAssignments =
getTaskAssignment(currStateOutput, prevAssignment, liveInstances, jobCfg, jobCtx,
workflowConfig, workflowCtx, allPartitions, cache);
for (Map.Entry<String, SortedSet<Integer>> entry : taskAssignments.entrySet()) {
String instance = entry.getKey();
if (!tgtPartitionAssignments.containsKey(instance)) {
continue;
}
// Contains the set of task partitions currently assigned to the instance.
Set<Integer> pSet = entry.getValue();
int numToAssign = jobCfg.getNumConcurrentTasksPerInstance() - pSet.size();
if (numToAssign > 0) {
List<Integer> nextPartitions =
getNextPartitions(tgtPartitionAssignments.get(instance), excludeSet, numToAssign);
for (Integer pId : nextPartitions) {
String pName = pName(jobResource, pId);
paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.RUNNING.name()));
excludeSet.add(pId);
jobCtx.setAssignedParticipant(pId, instance);
jobCtx.setPartitionState(pId, TaskPartitionState.INIT);
LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName,
TaskPartitionState.RUNNING, instance));
}
}
}
}
// Construct a ResourceAssignment object from the map of partition assignments.
ResourceAssignment ra = new ResourceAssignment(jobResource);
for (Map.Entry<Integer, PartitionAssignment> e : paMap.entrySet()) {
PartitionAssignment pa = e.getValue();
ra.addReplicaMap(new Partition(pName(jobResource, e.getKey())),
ImmutableMap.of(pa._instance, pa._state));
}
return ra;
}