/*
* TODO: We could be smarter here and group the shards by index and then
* use the sorter to save some iterations.
*/
final AllocationDeciders deciders = allocation.deciders();
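/*
 * Sort primaries before replicas, then by index name and shard id, so that
 * identical copies of the same shard end up next to each other.
 */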
final Comparator<MutableShardRouting> comparator = new Comparator<MutableShardRouting>() {
@Override
public int compare(MutableShardRouting o1, MutableShardRouting o2) {
if (o1.primary() ^ o2.primary()) {
return o1.primary() ? -1 : 1;
}
}
final int indexCmp;
if ((indexCmp = o1.index().compareTo(o2.index())) == 0) {
return o1.getId() - o2.getId();
}
return indexCmp;
}
};
/*
* We use 2 arrays and move replicas to the second array once we have allocated an identical
* replica in the current iteration, to make sure all indices get allocated in the same manner.
* The arrays are sorted by primaries first and then by index and shard ID, so two indices with
* one shard and two replicas each would look like:
* [(0,P,IDX1), (0,P,IDX2), (0,R,IDX1), (0,R,IDX1), (0,R,IDX2), (0,R,IDX2)]
* If we allocate for instance (0,R,IDX1) we move the second replica to the secondary array and proceed with
* the next replica. If we could not find a node to allocate (0,R,IDX1) we move all of its replicas to ignoredUnassigned.
*/
MutableShardRouting[] primary = unassigned.drain();
MutableShardRouting[] secondary = new MutableShardRouting[primary.length];
int secondaryLength = 0;
int primaryLength = primary.length;
ArrayUtil.timSort(primary, comparator);
final Set<ModelNode> throttledNodes = new IdentityHashSet<>();
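/*
 * Allocate the sorted shards one by one; see the comment above for how the
 * primary/secondary arrays are used. Nodes that can no longer accept shards
 * in this round are collected in throttledNodes and skipped.
 */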
do {
for (int i = 0; i < primaryLength; i++) {
MutableShardRouting shard = primary[i];
if (!shard.primary()) {
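/*
 * For replicas, first check whether the shard can be allocated at all. If the
 * deciders say NO, drop this replica and all identical copies; otherwise defer
 * the identical copies to the secondary array so they are handled in a later pass.
 */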
boolean drop = deciders.canAllocate(shard, allocation).type() == Type.NO;
if (drop) {
ignoredUnassigned.add(shard);
while(i < primaryLength-1 && comparator.compare(primary[i], primary[i+1]) == 0) {
ignoredUnassigned.add(primary[++i]);
}
continue;
} else {
while(i < primaryLength-1 && comparator.compare(primary[i], primary[i+1]) == 0) {
secondary[secondaryLength++] = primary[++i];
}
}
}
assert !shard.assignedToNode() : shard;
/* find a node with minimal weight we can allocate the shard on */
float minWeight = Float.POSITIVE_INFINITY;
ModelNode minNode = null;
Decision decision = null;
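/* only search if at least one node has not been throttled in this round */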
if (throttledNodes.size() < nodes.size()) {
/* Don't iterate over an identity hashset here; the
* iteration order is different for each run and makes testing hard */
for (ModelNode node : nodes.values()) {
if (throttledNodes.contains(node)) {
continue;
}
/*
* The shard we add here is removed again below; we only simulate the
* addition for the weight calculation. We use Decision.ALWAYS to
* not violate the not-null condition.
*/
if (!node.containsShard(shard)) {
node.addShard(shard, Decision.ALWAYS);
float currentWeight = weight.weight(Operation.ALLOCATE, this, node, shard.index());
/*
* Remove the shard from the node again; this was only a
* simulation
*/
Decision removed = node.removeShard(shard);
assert removed != null;
/*
* Only ask the deciders if this node is at least as good (by weight)
* as the current minimum; a heavier node cannot win anyway.
*/
if (currentWeight <= minWeight) {
Decision currentDecision = deciders.canAllocate(shard, routingNodes.node(node.getNodeId()), allocation);
NOUPDATE:
if (currentDecision.type() == Type.YES || currentDecision.type() == Type.THROTTLE) {
if (currentWeight == minWeight) {
/* we have an equal weight; tie-break as follows:
* 1. if one decision is YES prefer it
* 2. prefer the node that holds the primary for this index with the next id in the ring, i.e.
* for the 3 shards / 2 replicas case we try to build up:
* 1 2 0
* 2 0 1
* 0 1 2
* such that if we need to tie-break we prefer the node holding a shard with the minimal id greater
* than the id of the shard we need to assign. This works fine when new indices are created since
* primaries are added first and we only add one shard set at a time in this algorithm.
*/
if (currentDecision.type() == decision.type()) {
final int repId = shard.id();
final int nodeHigh = node.highestPrimary(shard.index());
final int minNodeHigh = minNode.highestPrimary(shard.index());
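/*
 * Prefer the node whose highest primary id for this index is the smallest id
 * greater than the replica id: ids above the replica id beat ids below it,
 * and ties on the same side go to the smaller id.
 */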
if ((((nodeHigh > repId && minNodeHigh > repId) || (nodeHigh < repId && minNodeHigh < repId)) && (nodeHigh < minNodeHigh))
|| (nodeHigh > minNodeHigh && nodeHigh > repId && minNodeHigh < repId)) {
minNode = node;
minWeight = currentWeight;
decision = currentDecision;
} else {
break NOUPDATE;
}
} else if (currentDecision.type() != Type.YES) {
break NOUPDATE;
}
}
minNode = node;
minWeight = currentWeight;
decision = currentDecision;
}
}
}
}
}
assert (decision != null && minNode != null) || (decision == null && minNode == null);
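/*
 * If we found a node the decision is either YES or THROTTLE. The shard is added to
 * the balancer model in both cases but only actually assigned to the routing node
 * on a YES decision.
 */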
if (minNode != null) {
minNode.addShard(shard, decision);
if (decision.type() == Type.YES) {
if (logger.isTraceEnabled()) {
logger.trace("Assigned shard [{}] to [{}]", shard, minNode.getNodeId());
}
routingNodes.assign(shard, routingNodes.node(minNode.getNodeId()).nodeId());
changed = true;
continue; // don't add to ignoreUnassigned
} else {
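/*
 * The decision was THROTTLE; if the node as a whole cannot accept any more
 * shards this round, stop considering it for the remaining shards.
 */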
final RoutingNode node = routingNodes.node(minNode.getNodeId());
if (deciders.canAllocate(node, allocation).type() != Type.YES) {
if (logger.isTraceEnabled()) {
logger.trace("Can not allocate on node [{}] remove from round decisin [{}]", node, decision.type());
}
throttledNodes.add(minNode);
}