/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.qjournal.client;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetSocketAddress;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.PriorityQueue;
import java.util.concurrent.TimeoutException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalConfigHelper;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalConfigKeys;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetStorageStateProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.Transition;
import org.apache.hadoop.hdfs.server.common.Storage.StorageState;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream;
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
import org.apache.hadoop.hdfs.server.namenode.FSEditLog;
import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.FSImageFile;
import org.apache.hadoop.hdfs.server.namenode.ImageInputStream;
import org.apache.hadoop.hdfs.server.namenode.ImageManager;
import org.apache.hadoop.hdfs.server.namenode.JournalManager;
import org.apache.hadoop.hdfs.server.namenode.JournalSet;
import org.apache.hadoop.hdfs.server.namenode.RemoteStorageState;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.server.protocol.RemoteImage;
import org.apache.hadoop.hdfs.server.protocol.RemoteImageManifest;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.util.StringUtils;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
/**
* A JournalManager that writes to a set of remote JournalNodes,
* requiring a quorum of nodes to ack each write.
*/
@InterfaceAudience.Private
public class QuorumJournalManager implements JournalManager, ImageManager {
public static final Log LOG = LogFactory.getLog(QuorumJournalManager.class);
// Timeouts for which the QJM will wait for each of the following actions.
private final int startSegmentTimeoutMs;
private final int prepareRecoveryTimeoutMs;
private final int acceptRecoveryTimeoutMs;
private final int finalizeSegmentTimeoutMs;
private final int selectInputStreamsTimeoutMs;
private final int getImageManifestTimeoutMs;
private final int getJournalStateTimeoutMs;
private final int newEpochTimeoutMs;
private final int writeTxnsTimeoutMs;
private final int httpConnectReadTimeoutMs;
private final int imageUploadBufferSize;
private final int imageUploadMaxBufferedChunks;
// Since these don't occur during normal operation, we can
// use rather lengthy timeouts
private final int dirTransitionTimeoutMs; // 10 mins by default
private final int hasDataTimeoutMs; // 10 mins by default
// URI scheme used by quorum-journal URIs (e.g. qjm://host1;host2/jid).
// Declared as String rather than Object: the value is a String literal and
// the narrower type restores compile-time type checking at call sites
// (source-compatible, since String is assignable to Object).
public static final String QJM_URI_SCHEME = "qjm";
private final Configuration conf;
private final URI uri;
private final NamespaceInfo nsInfo;
private boolean isActiveWriter;
private final String journalId;
private final AsyncLoggerSet loggers;
private final NameNodeMetrics metrics;
private final boolean hasImageStorage;
private volatile boolean imageDisabled = false;
private final List<String> httpAddresses;
/**
 * Creates a QuorumJournalManager using the default IPC-based logger factory.
 *
 * @param conf configuration; must not be null
 * @param uri journal URI, e.g. qjm://host1;host2;host3/journalId
 * @param nsInfo namespace info used when talking to the journal nodes
 * @param metrics metrics sink for the quorum output stream
 * @param hasImageStorage whether this quorum also stores the fsimage
 * @throws IOException if the loggers cannot be created
 */
public QuorumJournalManager(Configuration conf, URI uri,
NamespaceInfo nsInfo, NameNodeMetrics metrics, boolean hasImageStorage)
throws IOException {
// Fixed typo in the parameter name (was "hasImageStroage").
this(conf, uri, nsInfo, IPCLoggerChannel.FACTORY, metrics, hasImageStorage);
}
/**
 * Creates a QuorumJournalManager with an explicit logger factory, reading
 * all per-operation timeouts and image-upload settings from the
 * configuration.
 *
 * @param conf configuration; must not be null
 * @param uri journal URI, e.g. qjm://host1;host2;host3/journalId
 * @param nsInfo namespace info used when talking to the journal nodes
 * @param loggerFactory factory used to build one AsyncLogger per node
 * @param metrics metrics sink for the quorum output stream
 * @param hasImageStorage whether this quorum also stores the fsimage
 * @throws IOException if the loggers cannot be created
 */
public QuorumJournalManager(Configuration conf, URI uri, NamespaceInfo nsInfo,
AsyncLogger.Factory loggerFactory, NameNodeMetrics metrics,
boolean hasImageStorage) throws IOException {
Preconditions.checkArgument(conf != null, "must be configured");
this.conf = conf;
this.uri = uri;
this.nsInfo = nsInfo;
// One AsyncLogger per journal node, wrapped in a quorum helper.
this.loggers = new AsyncLoggerSet(createLoggers(loggerFactory));
this.metrics = metrics;
this.journalId = parseJournalId(uri);
// Configure timeouts.
this.startSegmentTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_START_SEGMENT_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_START_SEGMENT_TIMEOUT_DEFAULT);
this.prepareRecoveryTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_PREPARE_RECOVERY_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_PREPARE_RECOVERY_TIMEOUT_DEFAULT);
this.acceptRecoveryTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_ACCEPT_RECOVERY_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_ACCEPT_RECOVERY_TIMEOUT_DEFAULT);
this.finalizeSegmentTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_FINALIZE_SEGMENT_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_FINALIZE_SEGMENT_TIMEOUT_DEFAULT);
this.selectInputStreamsTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_SELECT_INPUT_STREAMS_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_SELECT_INPUT_STREAMS_TIMEOUT_DEFAULT);
this.getImageManifestTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_GET_IMAGE_MANIFEST_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_GET_IMAGE_MANIFEST_TIMEOUT_DEFAULT);
this.getJournalStateTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_GET_JOURNAL_STATE_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_GET_JOURNAL_STATE_TIMEOUT_DEFAULT);
this.newEpochTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_NEW_EPOCH_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_NEW_EPOCH_TIMEOUT_DEFAULT);
this.writeTxnsTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_WRITE_TXNS_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_WRITE_TXNS_TIMEOUT_DEFAULT);
this.httpConnectReadTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_HTTP_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_HTTP_TIMEOUT_DEFAULT);
// Directory transitions (format/upgrade/rollback) and data probes are
// rare, so these timeouts are much longer than the per-write ones.
this.dirTransitionTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_FORMAT_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_FORMAT_TIMEOUT_DEFAULT);
this.hasDataTimeoutMs = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_HAS_DATA_TIMEOUT_KEY,
JournalConfigKeys.DFS_QJOURNAL_HAS_DATA_TIMEOUT_DEFAULT);
this.hasImageStorage = hasImageStorage;
if (hasImageStorage) {
LOG.info("QJM Journal: " + uri + " will store image.");
}
// Image uploads are chunked and buffered; see HttpImageUploadStream.
this.imageUploadBufferSize = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_IMAGE_BUFFER_SIZE_KEY,
JournalConfigKeys.DFS_QJOURNAL_IMAGE_BUFFER_SIZE_DEFAULT);
this.imageUploadMaxBufferedChunks = conf.getInt(
JournalConfigKeys.DFS_QJOURNAL_IMAGE_MAX_BUFFERED_CHUNKS_KEY,
JournalConfigKeys.DFS_QJOURNAL_IMAGE_MAX_BUFFERED_CHUNKS_DEFAULT);
this.httpAddresses = getHttpAddresses();
}
/**
 * Creates the per-journal-node logger channels for this manager's
 * configured URI and namespace, using the supplied factory.
 * Overridable for tests that need to inject mock loggers.
 */
protected List<AsyncLogger> createLoggers(
AsyncLogger.Factory factory) throws IOException {
return createLoggers(conf, uri, nsInfo, factory);
}
/**
 * Extracts the journal identifier from the path component of a qjm URI:
 * the leading '/' is stripped and the remainder is validated via
 * {@link #checkJournalId(String)}.
 *
 * @throws IllegalArgumentException if the path is missing or the id is bad
 */
static String parseJournalId(URI uri) {
  final String path = uri.getPath();
  Preconditions.checkArgument(path != null && !path.isEmpty(),
      "Bad URI '%s': must identify journal in path component",
      uri);
  final String jid = path.substring(1);
  checkJournalId(jid);
  return jid;
}
/**
 * Validates a journal id: it must be non-null, non-empty, and must not
 * begin with '.'.
 *
 * @throws IllegalArgumentException if the id is invalid
 */
public static void checkJournalId(String jid) {
  final boolean valid =
      jid != null && !jid.isEmpty() && !jid.startsWith(".");
  if (!valid) {
    throw new IllegalArgumentException("bad journal id: " + jid);
  }
}
/**
 * Fence any previous writers, and obtain a unique epoch number
 * for write-access to the journal nodes.
 *
 * @return the newEpoch() responses from the write quorum, keyed by logger
 *         (note: the epoch itself is recorded in the logger set)
 * @throws IOException if a write quorum cannot be reached
 */
Map<AsyncLogger, NewEpochResponseProto> createNewUniqueEpoch()
throws IOException {
Preconditions.checkState(!loggers.isEpochEstablished(),
"epoch already created");
// Phase 1: learn the highest epoch any quorum member has promised to.
Map<AsyncLogger, GetJournalStateResponseProto> lastPromises =
loggers.waitForWriteQuorum(loggers.getJournalState(),
getJournalStateTimeoutMs, "getJournalState()");
long maxPromised = Long.MIN_VALUE;
for (GetJournalStateResponseProto resp : lastPromises.values()) {
maxPromised = Math.max(maxPromised, resp.getLastPromisedEpoch());
}
assert maxPromised >= 0;
// Phase 2: claim the next epoch on a write quorum. Journal nodes will
// reject writers with older epochs from now on (fencing).
long myEpoch = maxPromised + 1;
Map<AsyncLogger, NewEpochResponseProto> resps =
loggers.waitForWriteQuorum(loggers.newEpoch(nsInfo, myEpoch),
newEpochTimeoutMs, "newEpoch(" + myEpoch + ")");
loggers.setEpoch(myEpoch);
return resps;
}
/**
 * Copies layout version, namespace id and cTime from the given
 * StorageInfo into this manager's NamespaceInfo so that subsequent RPCs
 * to the journal nodes carry the updated storage identity.
 */
public void updateNamespaceInfo(StorageInfo si) {
// update nsInfo
nsInfo.layoutVersion = si.layoutVersion;
nsInfo.namespaceID = si.namespaceID;
nsInfo.cTime = si.cTime;
}
/**
 * Waits for ALL loggers to complete a storage-directory transition
 * (format/upgrade/rollback/...). Unlike normal quorum operations these
 * must succeed on every node, so this waits for loggers.size() successes
 * and tolerates zero exceptions.
 *
 * @param call the in-flight quorum call
 * @param methodName human-readable operation name, used in error messages
 * @throws IOException if interrupted, timed out, or any node failed
 */
private void invokeDirTransition(QuorumCall<AsyncLogger, Void> call,
String methodName) throws IOException {
try {
call.waitFor(loggers.size(), loggers.size(), 0, dirTransitionTimeoutMs,
methodName);
} catch (InterruptedException e) {
// Restore interrupt status so callers up the stack can observe it;
// also fixed the missing space after "for" and chained the cause.
Thread.currentThread().interrupt();
throw new IOException("Interrupted waiting for " + methodName
+ "() response", e);
} catch (TimeoutException e) {
throw new IOException("Timed out waiting for " + methodName
+ "() response", e);
}
if (call.countExceptions() > 0) {
// Name the actual operation: the old message always said "upgrade"
// even when formatting or rolling back.
call.throwQuorumException("Could not " + methodName
+ " on one or more JournalNodes");
}
}
/**
 * Applies the given storage transition (format, upgrade, rollback, ...)
 * to the journal storage on all journal nodes.
 */
@Override
public void transitionJournal(StorageInfo si, Transition transition,
StartupOption startOpt) throws IOException {
  // Transitions that (re)create or rewrite storage need the namespace
  // info refreshed from the supplied StorageInfo first.
  final boolean refreshesNamespace =
      transition == Transition.FORMAT
      || transition == Transition.UPGRADE
      || transition == Transition.COMPLETE_UPGRADE
      || transition == Transition.ROLLBACK;
  if (refreshesNamespace) {
    updateNamespaceInfo(si);
  }
  invokeDirTransition(loggers.transitionJournal(nsInfo, transition, startOpt),
      transition.toString() + " journal");
}
/** Returns true if any journal node reports formatted journal storage. */
@Override
public boolean hasSomeJournalData() throws IOException {
return hasSomeDataInternal(false);
}
/** Returns true if any journal node reports formatted image storage. */
@Override
public boolean hasSomeImageData() throws IOException {
return hasSomeDataInternal(true);
}
/**
 * Checks if any data is available in the underlying storage.
 * Returns true if any of the nodes has some data. Requires a response
 * (success or failure) from every node; any exception from a node fails
 * the whole check.
 *
 * @param image true to probe image storage, false for journal storage
 * @throws IOException if interrupted, timed out, or any node errored
 */
private boolean hasSomeDataInternal(boolean image) throws IOException {
QuorumCall<AsyncLogger, Boolean> call = image ? loggers.isImageFormatted() :
loggers.isJournalFormatted();
try {
call.waitFor(loggers.size(), 0, 0, hasDataTimeoutMs, "hasSomeData");
} catch (InterruptedException e) {
// Restore interrupt status and preserve the cause (previously both
// were dropped).
Thread.currentThread().interrupt();
throw new IOException("Interrupted while determining if JNs have data", e);
} catch (TimeoutException e) {
throw new IOException("Timed out waiting for response from loggers", e);
}
if (call.countExceptions() > 0) {
call.throwQuorumException(
"Unable to check if JNs are ready for formatting");
}
// If any of the loggers returned with a non-empty manifest, then
// we should prompt for format.
for (Boolean hasData : call.getResults().values()) {
if (hasData) {
return true;
}
}
// Otherwise, none were formatted, we can safely format.
return false;
}
/**
 * Run recovery/synchronization for a specific segment.
 * Postconditions:
 * <ul>
 * <li>This segment will be finalized on a majority
 * of nodes.</li>
 * <li>All nodes which contain the finalized segment will
 * agree on the length.</li>
 * </ul>
 *
 * @param segmentTxId the starting txid of the segment
 * @throws IOException if any quorum step fails
 */
private void recoverUnclosedSegment(long segmentTxId) throws IOException {
Preconditions.checkArgument(segmentTxId > -1);
LOG.info("Beginning recovery of unclosed segment starting at txid " +
segmentTxId);
// Step 1. Prepare recovery
QuorumCall<AsyncLogger,PrepareRecoveryResponseProto> prepare =
loggers.prepareRecovery(segmentTxId);
Map<AsyncLogger, PrepareRecoveryResponseProto> prepareResponses=
loggers.waitForWriteQuorum(prepare, prepareRecoveryTimeoutMs,
"prepareRecovery(" + segmentTxId + ")");
LOG.info("Recovery prepare phase complete. Responses:\n" +
QuorumCall.mapToString(prepareResponses));
// Determine the logger who either:
// a) Has already accepted a previous proposal that's higher than any
// other
//
// OR, if no such logger exists:
//
// b) Has the longest log starting at this transaction ID
// TODO: we should collect any "ties" and pass the URL for all of them
// when syncing, so we can tolerate failure during recovery better.
Entry<AsyncLogger, PrepareRecoveryResponseProto> bestEntry = Collections.max(
prepareResponses.entrySet(), SegmentRecoveryComparator.INSTANCE);
AsyncLogger bestLogger = bestEntry.getKey();
PrepareRecoveryResponseProto bestResponse = bestEntry.getValue();
// Log the above decision, check invariants.
if (bestResponse.hasAcceptedInEpoch()) {
LOG.info("Using already-accepted recovery for segment " +
"starting at txid " + segmentTxId + ": " +
bestEntry);
} else if (bestResponse.hasSegmentState()) {
LOG.info("Using longest log: " + bestEntry);
} else {
// None of the responses to prepareRecovery() had a segment at the given
// txid. This can happen for example in the following situation:
// - 3 JNs: JN1, JN2, JN3
// - writer starts segment 101 on JN1, then crashes before
// writing to JN2 and JN3
// - during newEpoch(), we saw the segment on JN1 and decide to
// recover segment 101
// - before prepare(), JN1 crashes, and we only talk to JN2 and JN3,
// neither of which has any entry for this log.
// In this case, it is allowed to do nothing for recovery, since the
// segment wasn't started on a quorum of nodes.
// Sanity check: we should only get here if none of the responses had
// a log. This should be a postcondition of the recovery comparator,
// but a bug in the comparator might cause us to get here.
for (PrepareRecoveryResponseProto resp : prepareResponses.values()) {
assert !resp.hasSegmentState() :
"One of the loggers had a response, but no best logger " +
"was found.";
}
LOG.info("None of the responders had a log to recover: " +
QuorumCall.mapToString(prepareResponses));
return;
}
SegmentStateProto logToSync = bestResponse.getSegmentState();
assert segmentTxId == logToSync.getStartTxId();
// Sanity check: none of the loggers should be aware of a higher
// txid than the txid we intend to truncate to
for (Map.Entry<AsyncLogger, PrepareRecoveryResponseProto> e :
prepareResponses.entrySet()) {
AsyncLogger logger = e.getKey();
PrepareRecoveryResponseProto resp = e.getValue();
if (resp.hasLastCommittedTxId() &&
resp.getLastCommittedTxId() > logToSync.getEndTxId()) {
throw new AssertionError("Decided to synchronize log to " + logToSync +
" but logger " + logger + " had seen txid " +
resp.getLastCommittedTxId() + " committed");
}
}
// Step 2. Accept recovery: each node syncs the chosen segment (from the
// best logger's HTTP endpoint) and records the accepted proposal.
URL syncFromUrl = bestLogger.buildURLToFetchLogs(segmentTxId, 0);
QuorumCall<AsyncLogger,Void> accept = loggers.acceptRecovery(logToSync, syncFromUrl.toString());
loggers.waitForWriteQuorum(accept, acceptRecoveryTimeoutMs,
"acceptRecovery(" + logToSync + ")");
// Step 3. Finalize the recovered segment on a quorum.
// If one of the loggers above missed the synchronization step above, but
// we send a finalize() here, that's OK. It validates the log before
// finalizing. Hence, even if it is not "in sync", it won't incorrectly
// finalize.
QuorumCall<AsyncLogger, Void> finalize =
loggers.finalizeLogSegment(logToSync.getStartTxId(), logToSync.getEndTxId());
loggers.waitForWriteQuorum(finalize, finalizeSegmentTimeoutMs,
String.format("finalizeLogSegment(%s-%s)",
logToSync.getStartTxId(),
logToSync.getEndTxId()));
}
/**
 * Builds one AsyncLogger per configured journal-node address for the
 * given journal URI.
 */
static List<AsyncLogger> createLoggers(Configuration conf,
URI uri, NamespaceInfo nsInfo, AsyncLogger.Factory factory)
throws IOException {
  final String jid = parseJournalId(uri);
  final List<AsyncLogger> createdLoggers = Lists.newArrayList();
  for (InetSocketAddress addr : getLoggerAddresses(uri)) {
    createdLoggers.add(factory.createLogger(conf, nsInfo, jid, addr));
  }
  return createdLoggers;
}
/**
 * Resolves the authority component of the URI into socket addresses,
 * applying the default journal-node RPC port where none is given.
 */
private static List<InetSocketAddress> getLoggerAddresses(URI uri)
throws IOException {
  final List<InetSocketAddress> result = Lists.newArrayList();
  for (String hostAndPort : parseAddresses(uri)) {
    result.add(NetUtils.createSocketAddr(
        hostAndPort, JournalConfigKeys.DFS_JOURNALNODE_RPC_PORT_DEFAULT));
  }
  return result;
}
/**
 * Splits the URI authority on ';' into individual trimmed host:port
 * strings. Warns when an even number of nodes is configured, since an
 * even-sized quorum is not recommended.
 */
private static String[] parseAddresses(URI uri) {
  final String authority = uri.getAuthority();
  Preconditions.checkArgument(authority != null && !authority.isEmpty(),
      "URI has no authority: " + uri);
  final String[] nodes = StringUtils.split(authority, ';');
  for (int i = 0; i < nodes.length; i++) {
    nodes[i] = nodes[i].trim();
  }
  if (nodes.length % 2 == 0) {
    LOG.warn("Quorum journal URI '" + uri + "' has an even number " +
        "of Journal Nodes specified. This is not recommended!");
  }
  return nodes;
}
/**
 * Starts a new log segment beginning at txId on a write quorum of nodes
 * and returns an output stream writing to that quorum.
 * Must only be called after recoverUnfinalizedSegments() has succeeded.
 */
@Override
public EditLogOutputStream startLogSegment(long txId) throws IOException {
Preconditions.checkState(isActiveWriter,
"must recover segments before starting a new one");
QuorumCall<AsyncLogger,Void> q = loggers.startLogSegment(txId);
loggers.waitForWriteQuorum(q, startSegmentTimeoutMs,
"startLogSegment(" + txId + ")");
return new QuorumOutputStream(loggers, txId, FSEditLog.sizeFlushBuffer,
writeTxnsTimeoutMs, metrics, journalId);
}
/**
 * Finalizes the log segment [firstTxId, lastTxId] on a write quorum of
 * journal nodes.
 */
@Override
public void finalizeLogSegment(long firstTxId, long lastTxId)
throws IOException {
  final QuorumCall<AsyncLogger, Void> call =
      loggers.finalizeLogSegment(firstTxId, lastTxId);
  loggers.waitForWriteQuorum(call, finalizeSegmentTimeoutMs,
      String.format("finalizeLogSegment(%s-%s)", firstTxId, lastTxId));
}
/**
 * Asks the journal nodes to purge edit logs older than the given txid.
 * Fire-and-forget: purging is best-effort, so no quorum wait is needed.
 */
@Override
public void purgeLogsOlderThan(long minTxIdToKeep) throws IOException {
// This purges asynchronously -- there's no need to wait for a quorum
// here, because it's always OK to fail.
LOG.info("Purging remote journals older than txid " + minTxIdToKeep);
loggers.purgeLogsOlderThan(minTxIdToKeep);
}
/**
 * Propagates the committed transaction id to the loggers.
 * NOTE(review): the exact semantics of {@code force} are defined in
 * AsyncLoggerSet.setCommittedTxId -- confirm there.
 */
@Override
public void setCommittedTxId(long txid, boolean force) {
LOG.info("Set committed transaction ID " + txid + " force=" + force);
loggers.setCommittedTxId(txid, force);
}
/**
 * Runs writer recovery: establishes a new epoch (fencing any previous
 * writer), locates the most recent unfinalized segment across the
 * quorum, and synchronizes/finalizes it. Must be called exactly once
 * before startLogSegment().
 */
@Override
public void recoverUnfinalizedSegments() throws IOException {
Preconditions.checkState(!isActiveWriter, "already active writer");
LOG.info("Starting recovery process for unclosed journal segments...");
Map<AsyncLogger, NewEpochResponseProto> resps = createNewUniqueEpoch();
LOG.info("Successfully started new epoch " + loggers.getEpoch());
if (LOG.isDebugEnabled()) {
LOG.debug("newEpoch(" + loggers.getEpoch() + ") responses:\n" +
QuorumCall.mapToString(resps));
}
// The highest segment-start txid reported by any node is the only
// segment that can still be unfinalized.
long mostRecentSegmentTxId = Long.MIN_VALUE;
for (NewEpochResponseProto r : resps.values()) {
if (r.hasLastSegmentTxId()) {
mostRecentSegmentTxId = Math.max(mostRecentSegmentTxId,
r.getLastSegmentTxId());
}
}
// On a completely fresh system, none of the journals have any
// segments, so there's nothing to recover.
if (mostRecentSegmentTxId != Long.MIN_VALUE) {
recoverUnclosedSegment(mostRecentSegmentTxId);
}
isActiveWriter = true;
}
/** Closes all per-node logger channels. */
@Override
public void close() throws IOException {
loggers.close();
}
/**
 * Select input streams covering transactions from {@code fromTxnId}.
 * inProgressOk should be true only for tailing, not for startup.
 * Collects per-node edit-log manifests (requiring a response from every
 * node, with a majority of successes), builds one URL-backed stream per
 * remote segment, and chains/merges them into redundant streams.
 */
@Override
public void selectInputStreams(Collection<EditLogInputStream> streams,
long fromTxnId, boolean inProgressOk, boolean validateInProgressSegments)
throws IOException {
QuorumCall<AsyncLogger, RemoteEditLogManifest> q =
loggers.getEditLogManifest(fromTxnId);
// we insist on getting all responses, even if they are to be exceptions
// this will fail if we cannot get majority of successes
Map<AsyncLogger, RemoteEditLogManifest> resps = loggers
.waitForReadQuorumWithAllResponses(q, selectInputStreamsTimeoutMs,
"selectInputStreams");
if(LOG.isDebugEnabled()) {
LOG.debug("selectInputStream manifests:\n" +
Joiner.on("\n").withKeyValueSeparator(": ").join(resps));
}
// Order candidate streams by the journal-set comparator so redundant
// copies of the same segment can be grouped.
final PriorityQueue<EditLogInputStream> allStreams =
new PriorityQueue<EditLogInputStream>(64,
JournalSet.EDIT_LOG_INPUT_STREAM_COMPARATOR);
for (Map.Entry<AsyncLogger, RemoteEditLogManifest> e : resps.entrySet()) {
AsyncLogger logger = e.getKey();
RemoteEditLogManifest manifest = e.getValue();
for (RemoteEditLog remoteLog : manifest.getLogs()) {
EditLogInputStream elis = new URLLogInputStream(logger,
remoteLog.getStartTxId(), httpConnectReadTimeoutMs);
// Skip in-progress segments unless the caller asked for them.
if (elis.isInProgress() && !inProgressOk) {
continue;
}
allStreams.add(elis);
}
}
// we pass 0 as min redundance as we do not care about this here
JournalSet.chainAndMakeRedundantStreams(
streams, allStreams, fromTxnId, inProgressOk, 0);
}
/** Identifies this QJM by its logger set, e.g. for log messages. */
@Override
public String toString() {
return "QJM to " + loggers;
}
/** HTML rendering of the logger set, for status pages. */
@Override
public String toHTMLString() {
return "QJM to " + loggers.toHTMLString();
}
/** Exposes the logger set; for use by unit tests only. */
@VisibleForTesting
AsyncLoggerSet getLoggerSetForTests() {
return loggers;
}
/**
 * Not supported by QJM: edit-log manifests are fetched per journal node
 * inside selectInputStreams() instead of through this single-manifest API.
 */
@Override
public RemoteEditLogManifest getEditLogManifest(long fromTxId)
throws IOException {
throw new IOException("Not supported");
}
/**
 * Translates byte[] journal id into String by widening each byte to a
 * char. This will be done only the first time we are accessing a journal;
 * the id is expected to be ASCII.
 */
public static String journalIdBytesToString(byte[] jid) {
  final StringBuilder sb = new StringBuilder(jid.length);
  for (byte b : jid) {
    // Same widening cast as the original char[]-based loop, so behavior
    // is unchanged even for non-ASCII bytes.
    sb.append((char) b);
  }
  return sb.toString();
}
/**
 * Translates String journal id into byte[] by truncating each char to a
 * byte. We assume the id is ascii.
 */
public static byte[] journalIdStringToBytes(String jid) {
  final int len = jid.length();
  final byte[] bytes = new byte[len];
  for (int i = 0; i < len; i++) {
    bytes[i] = (byte) jid.charAt(i);
  }
  return bytes;
}
/** Returns whether this quorum was configured to also store the fsimage. */
@Override
public boolean hasImageStorage() {
return hasImageStorage;
}
/**
 * Builds the list of journal-node HTTP base URLs.
 * Consistent with JournalNode.getJournalHttpAddress.
 */
private List<String> getHttpAddresses() {
  final String[] hosts = JournalConfigHelper.getJournalHttpHosts(conf);
  for (int i = 0; i < hosts.length; i++) {
    hosts[i] = "http://" + hosts[i];
  }
  return Arrays.asList(hosts);
}
// ImageManager methods.
/**
 * Applies the given storage transition (format, upgrade, rollback, ...)
 * to the image storage on all journal nodes.
 */
@Override
public void transitionImage(StorageInfo si, Transition transition,
StartupOption startOpt) throws IOException {
  // Transitions that (re)create or rewrite storage need the namespace
  // info refreshed from the supplied StorageInfo first.
  final boolean refreshesNamespace =
      transition == Transition.FORMAT
      || transition == Transition.UPGRADE
      || transition == Transition.COMPLETE_UPGRADE
      || transition == Transition.ROLLBACK;
  if (refreshesNamespace) {
    updateNamespaceInfo(si);
  }
  invokeDirTransition(loggers.transitionImage(nsInfo, transition, startOpt),
      transition.toString() + " image");
}
/**
 * Creates output stream for image at txid to the underlying quorum of journal
 * nodes. The upload goes over HTTP in buffered chunks, tagged with the
 * current writer epoch.
 */
@Override
public OutputStream getCheckpointOutputStream(long txid) throws IOException {
return new HttpImageUploadStream(httpAddresses, journalId, nsInfo, txid,
loggers.getEpoch(), imageUploadBufferSize, imageUploadMaxBufferedChunks);
}
/**
 * Roll image and save md5 digest to the underlying nodes. This is a quorum
 * roll, and we ensure that it can succeed only on the nodes that consumed
 * entirely the uploaded image.
 *
 * @return true if a write quorum acknowledged the roll, false on failure
 */
@Override
public boolean saveDigestAndRenameCheckpointImage(long txid, MD5Hash digest) {
try {
LOG.info("Saving md5: " + digest + " for txid: " + txid);
QuorumCall<AsyncLogger, Void> q = loggers
.saveDigestAndRenameCheckpointImage(txid, digest);
loggers.waitForWriteQuorum(q, writeTxnsTimeoutMs,
"saveDigestAndRenameCheckpointImage(" + txid + ")");
return true;
} catch (IOException e) {
// Deliberate best-effort: the failure is reported to the caller via
// the return value rather than by rethrowing.
LOG.error("Exception when rolling the image:", e);
return false;
}
}
/**
 * Returns true if image storage is currently marked disabled.
 * NOTE(review): the original comment was truncated ("Return true if the
 * last image"); the flag is only read/written through this getter/setter
 * pair here, so its precise semantics are defined by callers -- confirm
 * at the call sites.
 */
@Override
public boolean isImageDisabled() {
return imageDisabled;
}
/**
 * Set image status (mark image storage disabled or enabled).
 */
@Override
public void setImageDisabled(boolean isDisabled) {
this.imageDisabled = isDisabled;
}
/**
 * Get manifest for the images stored in journal nodes. An image is considered
 * valid if it appears in majority of the nodes, with a valid md5 sum. The
 * returned images are sorted according to their transaction id.
 * Requires a response from every node, with a majority of successes.
 */
@Override
public RemoteImageManifest getImageManifest(long fromTxnId)
throws IOException {
QuorumCall<AsyncLogger, RemoteImageManifest> q = loggers
.getImageManifest(fromTxnId);
Map<AsyncLogger, RemoteImageManifest> resps = loggers
.waitForReadQuorumWithAllResponses(q, getImageManifestTimeoutMs,
"getImageManifest");
// Merge the per-node manifests into one consistent view.
return createImageManifest(resps.values());
}
/**
 * Concatenate manifests obtained from the underlying journalnodes. The final
 * manifest will contain only the images committed to the majority of the
 * nodes. Images with no md5 associated are ignored. Also, the md5 must match
 * between images from different journal nodes.
 *
 * @param resps per-node image manifests
 * @return merged manifest, sorted by transaction id
 * @throws IOException if two nodes report conflicting images for the
 *         same txid
 */
static RemoteImageManifest createImageManifest(
Collection<RemoteImageManifest> resps) throws IOException {
// found valid images (with md5 hash), keyed by starting txid
Map<Long, RemoteImage> images = Maps.newHashMap();
for (RemoteImageManifest rm : resps) {
for (RemoteImage ri : rm.getImages()) {
if (ri.getDigest() == null) {
LOG.info("Skipping: " + ri + " as it does not have md5 digest");
continue;
}
// Single lookup instead of containsKey() + get().
RemoteImage seen = images.get(ri.getTxId());
if (seen == null) {
// first sighting of this txid -- store it
images.put(ri.getTxId(), ri);
} else if (!seen.equals(ri)) {
// two images from different nodes should be the same
throw new IOException(
"Images received from different nodes do not match: "
+ seen + " vs: " + ri);
}
}
}
// copy in one step (replaces the manual element-by-element loop) and
// sort the images by transaction id
List<RemoteImage> result = Lists.newArrayList(images.values());
Collections.sort(result);
return new RemoteImageManifest(result);
}
/**
 * Get the most recent image committed to the underlying journal nodes,
 * or null if no valid image is available.
 */
@Override
public FSImageFile getLatestImage() throws IOException {
List<RemoteImage> images = getImageManifest(HdfsConstants.INVALID_TXID)
.getImages();
// nothing available on a quorum of nodes
if (images.isEmpty()) {
return null;
}
// the manifest is sorted by txid, so the last entry is the newest image
return new FSImageFile(null, null, images.get(images.size() - 1).getTxId(),
this);
}
/**
 * Get input stream to one of the nodes for given txid.
 *
 * @throws IOException if no node can serve the image
 */
@Override
public ImageInputStream getImageInputStream(long txid) throws IOException {
URLImageInputStream stream = loggers.getImageInputStream(txid,
httpConnectReadTimeoutMs);
if (stream == null) {
throw new IOException("Cannot obtain input stream for image: " + txid);
}
// Wrap the raw stream with the digest and size advertised by the node.
return new ImageInputStream(txid, stream, stream.getImageDigest(),
stream.toString(), stream.getSize());
}
/** Analyzes the journal storage state across all journal nodes. */
@Override
public RemoteStorageState analyzeJournalStorage() throws IOException {
return analyzeStorageInternal(false);
}
/** Analyzes the image storage state across all journal nodes. */
@Override
public RemoteStorageState analyzeImageStorage() throws IOException {
return analyzeStorageInternal(true);
}
/**
 * Queries every journal node for its storage state (journal or image)
 * and reduces the responses to a single RemoteStorageState. All nodes
 * must respond; if any node reports a different state or StorageInfo
 * than the others, the aggregate state is INCONSISTENT.
 *
 * @param image true to analyze image storage, false for journal storage
 * @throws IOException if interrupted, timed out, or any node errored
 */
private RemoteStorageState analyzeStorageInternal(boolean image) throws IOException {
QuorumCall<AsyncLogger, GetStorageStateProto> call = image ? loggers
.analyzeImageStorage() : loggers.analyzeJournalStorage();
try {
// we want all responses!
call.waitFor(loggers.size(), loggers.size(), 0, dirTransitionTimeoutMs,
"analyze storage");
} catch (InterruptedException e) {
// Restore interrupt status and preserve the cause (previously both
// were dropped).
Thread.currentThread().interrupt();
throw new IOException("Interrupted waiting for " +
(image ? "analyzeImageStorage()" : "analyzeJournalStorage()") + " response", e);
} catch (TimeoutException e) {
throw new IOException("Timed out waiting for " +
(image ? "analyzeImageStorage()" : "analyzeJournalStorage()") + " response", e);
}
if (call.countExceptions() > 0) {
call.throwQuorumException("Could not analyze one or more JournalNodes");
}
// iterate through responses and figure out the state; the first
// response seeds 'state'/'storageInfo', later ones must agree
StorageState state = null;
StorageInfo storageInfo = null;
for (GetStorageStateProto r : call.getResults().values()) {
if (state == null) {
state = r.getStorageState();
} else {
if (state != r.getStorageState()) {
state = StorageState.INCONSISTENT;
LOG.warn("Inconsistent state detected: "
+ Arrays.toString(call.getResults().values().toArray()));
return new RemoteStorageState(state, storageInfo);
}
}
if (storageInfo == null) {
storageInfo = r.getStorageInfo();
} else {
if (!storageInfo.equals(r.getStorageInfo())) {
state = StorageState.INCONSISTENT;
LOG.warn("Inconsistent state detected: "
+ Arrays.toString(call.getResults().values().toArray()));
return new RemoteStorageState(state, storageInfo);
}
}
}
// return the consistent state
return new RemoteStorageState(state, storageInfo);
}
/** Returns the journal URI this manager was constructed with. */
@Override
public URI getURI() throws IOException {
return uri;
}
}