public void phase1(final SnapshotIndexCommit snapshot) throws ElasticsearchException {
// Total size of all segment files in the snapshot, including those reused from the target
long totalSize = 0;
// Total size of the segment files that could be reused from the target's local store
long existingTotalSize = 0;
final Store store = shard.store();
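// Pin the store with a reference so it cannot be closed while phase1 reads segment files from it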
store.incRef();
try {
StopWatch stopWatch = new StopWatch().start();
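// Capture a point-in-time view of the commit's file metadata (names, lengths, checksums)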
final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot);
for (String name : snapshot.getFiles()) {
final StoreFileMetaData md = recoverySourceMetadata.get(name);
if (md == null) {
logger.info("Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.asMap());
throw new CorruptIndexException("Snapshot differs from actual index - maybe the index was removed? metadata has " +
recoverySourceMetadata.asMap().size() + " files", name);
}
}
// Generate a "diff" of all the identical, different, and missing
// segment files on the target node, using the existing files on
// the source node
final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(new Store.MetadataSnapshot(request.existingFiles()));
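// Identical files already exist on the target with matching checksums and lengths, so they can be reused instead of copied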
for (StoreFileMetaData md : diff.identical) {
response.phase1ExistingFileNames.add(md.name());
response.phase1ExistingFileSizes.add(md.length());
existingTotalSize += md.length();
if (logger.isTraceEnabled()) {
logger.trace("[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}], size [{}]",
indexName, shardId, request.targetNode(), md.name(), md.checksum(), md.length());
}
totalSize += md.length();
}
for (StoreFileMetaData md : Iterables.concat(diff.different, diff.missing)) {
if (request.existingFiles().containsKey(md.name())) {
logger.trace("[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote [{}], local [{}]",
indexName, shardId, request.targetNode(), md.name(), request.existingFiles().get(md.name()), md);
} else {
logger.trace("[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote",
indexName, shardId, request.targetNode(), md.name());
}
response.phase1FileNames.add(md.name());
response.phase1FileSizes.add(md.length());
totalSize += md.length();
}
response.phase1TotalSize = totalSize;
response.phase1ExistingTotalSize = existingTotalSize;
logger.trace("[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]",
indexName, shardId, request.targetNode(), response.phase1FileNames.size(),
new ByteSizeValue(totalSize), response.phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSize));
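// Tell the target node up front which files will be sent and which will be
// reused, so it can prepare to receive the chunks that follow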
RecoveryFilesInfoRequest recoveryInfoFilesRequest = new RecoveryFilesInfoRequest(request.recoveryId(), request.shardId(),
response.phase1FileNames, response.phase1FileSizes, response.phase1ExistingFileNames, response.phase1ExistingFileSizes,
response.phase1TotalSize, response.phase1ExistingTotalSize);
transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.FILES_INFO, recoveryInfoFilesRequest,
TransportRequestOptions.options().withTimeout(internalActionTimeout),
EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
// This latch will be used to wait until all files have been transferred to the target node
final CountDownLatch latch = new CountDownLatch(response.phase1FileNames.size());
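// Failures from the concurrent transfer tasks are collected here; corruption
// of the source files is tracked separately because it invalidates the primary itself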
final CopyOnWriteArrayList<Throwable> exceptions = new CopyOnWriteArrayList<>();
final AtomicReference<Throwable> corruptedEngine = new AtomicReference<>();
int fileIndex = 0;
ThreadPoolExecutor pool;
for (final String name : response.phase1FileNames) {
long fileSize = response.phase1FileSizes.get(fileIndex);
// Files are split into two categories: "small" files (under 5mb)
// and all others. Small files are transferred using a separate
// thread pool dedicated to small files.
//
// The idea behind this is that while we are transferring an
// older, large index, a user may create a new index, but that
// new index cannot recover until the large transfer finishes.
// By using two different thread pools, tiny files (like the
// segments of a brand-new index) can be recovered while large
// segment transfers are still in flight. It also allows the two
// pools to be configured independently.
if (fileSize > recoverySettings.SMALL_FILE_CUTOFF_BYTES) {
pool = recoverySettings.concurrentStreamPool();
} else {
pool = recoverySettings.concurrentSmallFileStreamPool();
}
pool.execute(new Runnable() {
@Override
public void run() {
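// Each transfer task holds its own store reference, released in the finally block below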
store.incRef();
final StoreFileMetaData md = recoverySourceMetadata.get(name);
try (final IndexInput indexInput = store.directory().openInput(name, IOContext.READONCE)) {
final int BUFFER_SIZE = (int) recoverySettings.fileChunkSize().bytes();
final byte[] buf = new byte[BUFFER_SIZE];
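// Skip transport-level compression for files that are already compressed on disk; recompressing wastes CPU for no gain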
boolean shouldCompressRequest = recoverySettings.compress();
if (CompressorFactory.isCompressed(indexInput)) {
shouldCompressRequest = false;
}
long len = indexInput.length();
long readCount = 0;
TransportRequestOptions requestOptions = TransportRequestOptions.options()
.withCompress(shouldCompressRequest)
.withType(TransportRequestOptions.Type.RECOVERY)
.withTimeout(internalActionTimeout);
while (readCount < len) {
if (shard.state() == IndexShardState.CLOSED) { // check if the shard got closed on us
throw new IndexShardClosedException(shard.shardId());
}
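// Read a full buffer, or only the remaining tail of the file on the last chunk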
int toRead = readCount + BUFFER_SIZE > len ? (int) (len - readCount) : BUFFER_SIZE;
long position = indexInput.getFilePointer();
// Pause using the rate limiter, if desired, to throttle the recovery
if (recoverySettings.rateLimiter() != null) {
recoverySettings.rateLimiter().pause(toRead);
}
indexInput.readBytes(buf, 0, toRead, false);
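// Wrap the buffer without copying; this is safe because the chunk request below blocks (txGet) until the target has received it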
BytesArray content = new BytesArray(buf, 0, toRead);
readCount += toRead;
// Actually send the file chunk to the target node, waiting for it to complete
transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.FILE_CHUNK,
new RecoveryFileChunkRequest(request.recoveryId(), request.shardId(), md, position, content, readCount == len),
requestOptions, EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
}
} catch (Throwable e) {
final Throwable corruptIndexException;
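// A corruption error anywhere in the cause chain requires checking whether the source file itself is corrupt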
if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
if (store.checkIntegrity(md) == false) { // we are corrupted on the primary -- fail!
logger.warn("{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
if (corruptedEngine.compareAndSet(null, corruptIndexException) == false) {
// if we are not the first exception, add ourselves as suppressed to the main one:
corruptedEngine.get().addSuppressed(e);
}
} else { // corruption has happened on the way to replica
RemoteTransportException exception = new RemoteTransportException("File corruption occurred on recovery but checksums are ok", null);
exception.addSuppressed(e);
exceptions.add(0, exception); // last exception first
logger.warn("{} File corruption on recovery {} local checksum OK",
corruptIndexException, shard.shardId(), md);
}
} else {
exceptions.add(0, e); // last exception first
}
} finally {
try {
store.decRef();
} finally {
// Signal that this file's transfer has completed by counting down the latch
latch.countDown();
}
}
}
});
fileIndex++;
}
// Wait for all files that need to be transferred to finish transferring
latch.await();
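// If the primary's own files turned out to be corrupted, fail the recovery with
// that error; otherwise rethrow any transfer failures, suppressing the rest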
if (corruptedEngine.get() != null) {
throw corruptedEngine.get();
} else {
ExceptionsHelper.rethrowAndSuppress(exceptions);
}
Set<String> snapshotFiles = Sets.newHashSet(snapshot.getFiles());
// Send the CLEAN_FILES request, which takes all of the files that
// were transferred and renames them from their temporary file
// names to the actual file names. It also writes checksums for
// the files after they have been renamed.
//
// Once the files have been renamed, any other files that are not
// related to this recovery (out of date segments, for example)
// are deleted
transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.CLEAN_FILES,
new RecoveryCleanFilesRequest(request.recoveryId(), shard.shardId(), snapshotFiles),
TransportRequestOptions.options().withTimeout(internalActionTimeout),
EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
stopWatch.stop();
logger.trace("[{}][{}] recovery [phase1] to {}: took [{}]", indexName, shardId, request.targetNode(), stopWatch.totalTime());
response.phase1Time = stopWatch.totalTime().millis();
} catch (Throwable e) {
throw new RecoverFilesRecoveryException(request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e);
} finally {
store.decRef();
}
}