package com.alimama.mdrill.editlog;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.alimama.mdrill.editlog.JournalSet;
import com.alimama.mdrill.editlog.JournalSet.JournalAndStream;
import com.alimama.mdrill.editlog.defined.HdfsConstants;
import com.alimama.mdrill.editlog.defined.LogsPurgeable;
import com.alimama.mdrill.editlog.defined.StorageDirectory;
import com.alimama.mdrill.editlog.read.EditLogInputStream;
import com.alimama.mdrill.editlog.read.FSEditLogOp;
import com.alimama.mdrill.editlog.read.RedundantEditLogInputStream;
import com.alimama.mdrill.editlog.util.IOUtils;
import com.alimama.mdrill.editlog.write.EditLogOutputStream;
import com.alimama.mdrill.editlog.write.FileJournalManager;
import org.apache.hadoop.conf.Configuration;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
public class FSEditLog implements LogsPurgeable {
static final Log LOG = LogFactory.getLog(FSEditLog.class);
private enum State {
UNINITIALIZED,
BETWEEN_LOG_SEGMENTS,
IN_SEGMENT,
OPEN_FOR_READING,
CLOSED;
}
private State state = State.UNINITIALIZED;
private JournalSet journalSet = null;
private EditLogOutputStream editLogStream = null;
private long txid = 0;
private long synctxid = 0;
private long curSegmentTxId = HdfsConstants.INVALID_TXID;
private long lastPrintTime;
private volatile boolean isSyncRunning;
private volatile boolean isAutoSyncScheduled = false;
private long numTransactions; // number of transactions
private long numTransactionsBatchedInSync;
private long totalTimeTransactions; // total time for all transactions
private List<StorageDirectory> editsDirs;
private static class TransactionId {
public long txid;
TransactionId(long value) {
this.txid = value;
}
}
private static final ThreadLocal<TransactionId> myTransactionId = new ThreadLocal<TransactionId>() {
@Override
protected synchronized TransactionId initialValue() {
return new TransactionId(Long.MAX_VALUE);
}
};
public FSEditLog(Configuration conf, List<StorageDirectory> editsDirs) {
init(conf, editsDirs);
}
private void init(Configuration conf, List<StorageDirectory> editsDirs) {
isSyncRunning = false;
lastPrintTime = System.currentTimeMillis();
this.editsDirs = Lists.newArrayList(editsDirs);
}
public synchronized void initJournalsForWrite() {
Preconditions.checkState(state == State.UNINITIALIZED ||
state == State.CLOSED, "Unexpected state: %s", state);
initJournals(this.editsDirs);
state = State.BETWEEN_LOG_SEGMENTS;
}
public synchronized void initSharedJournalsForRead() {
if (state == State.OPEN_FOR_READING) {
LOG.warn("Initializing shared journals for READ, already open for READ",
new Exception());
return;
}
Preconditions.checkState(state == State.UNINITIALIZED ||
state == State.CLOSED);
initJournals(this.editsDirs);
state = State.OPEN_FOR_READING;
}
private synchronized void initJournals(List<StorageDirectory> dirs) {
journalSet = new JournalSet(1);
for (StorageDirectory sd : dirs) {
journalSet.add(new FileJournalManager(sd), sd.isrequire());
}
if (journalSet.isEmpty()) {
LOG.error("No edits directories configured!");
}
}
/**
* Get the list of URIs the editlog is using for storage
* @return collection of URIs in use by the edit log
*/
Collection<StorageDirectory> getEditURIs() {
return editsDirs;
}
/**
* Initialize the output stream for logging, opening the first
* log segment.
*/
public synchronized void openForWrite() throws IOException {
Preconditions.checkState(state == State.BETWEEN_LOG_SEGMENTS, "Bad state: %s", state);
long segmentTxId = getLastWrittenTxId() + 1;
// Safety check: we should never start a segment if there are
// newer txids readable.
List<EditLogInputStream> streams = new ArrayList<EditLogInputStream>();
journalSet.selectInputStreams(streams, segmentTxId, true, true);
if (!streams.isEmpty()) {
String error = String.format("Cannot start writing at txid %s " +
"when there is a stream available for read: %s",
segmentTxId, streams.get(0));
IOUtils.cleanup(LOG, streams.toArray(new EditLogInputStream[0]));
throw new IllegalStateException(error);
}
startLogSegment(segmentTxId, true);
assert state == State.IN_SEGMENT : "Bad state: " + state;
}
/**
* @return true if the log is currently open in write mode, regardless
* of whether it actually has an open segment.
*/
synchronized boolean isOpenForWrite() {
return state == State.IN_SEGMENT ||
state == State.BETWEEN_LOG_SEGMENTS;
}
/**
* @return true if the log is open in write mode and has a segment open
* ready to take edits.
*/
synchronized boolean isSegmentOpen() {
return state == State.IN_SEGMENT;
}
/**
* @return true if the log is open in read mode.
*/
public synchronized boolean isOpenForRead() {
return state == State.OPEN_FOR_READING;
}
/**
* Shutdown the file store.
* @throws IOException
*/
synchronized void close() throws IOException {
if (state == State.CLOSED) {
LOG.debug("Closing log when already closed");
return;
}
try {
if (state == State.IN_SEGMENT) {
assert editLogStream != null;
waitForSyncToFinish();
endCurrentLogSegment(true);
}
} finally {
if (journalSet != null && !journalSet.isEmpty()) {
try {
journalSet.close();
} catch (IOException ioe) {
LOG.warn("Error closing journalSet", ioe);
}
}
state = State.CLOSED;
}
}
public void logEdit(final FSEditLogOp op) throws Exception {
synchronized (this) {
assert isOpenForWrite() :
"bad state: " + state;
// wait if an automatic sync is scheduled
waitIfAutoSyncScheduled();
long start = beginTransaction();
op.setTransactionId(txid);
try {
editLogStream.write(op);
} catch (IOException ex) {
// All journals failed, it is handled in logSync.
}
endTransaction(start);
isAutoSyncScheduled = true;
}
// sync buffered edit log entries to persistent store
logSync();
}
/**
* Wait if an automatic sync is scheduled
* @throws InterruptedException
*/
synchronized void waitIfAutoSyncScheduled() {
try {
while (isAutoSyncScheduled) {
this.wait(1000);
}
} catch (InterruptedException e) {
}
}
/**
* Signal that an automatic sync scheduling is done if it is scheduled
*/
synchronized void doneWithAutoSyncScheduling() {
if (isAutoSyncScheduled) {
isAutoSyncScheduled = false;
notifyAll();
}
}
private long beginTransaction() {
assert Thread.holdsLock(this);
// get a new transactionId
txid++;
//
// record the transactionId when new data was written to the edits log
//
TransactionId id = myTransactionId.get();
id.txid = txid;
return System.currentTimeMillis();
}
private void endTransaction(long start) {
assert Thread.holdsLock(this);
// update statistics
long end = System.currentTimeMillis();
numTransactions++;
totalTimeTransactions += (end-start);
}
/**
* Return the transaction ID of the last transaction written to the log.
*/
public synchronized long getLastWrittenTxId() {
return txid;
}
/**
* @return the first transaction ID in the current log segment
*/
synchronized long getCurSegmentTxId() {
Preconditions.checkState(isSegmentOpen(),
"Bad state: %s", state);
return curSegmentTxId;
}
/**
* Set the transaction ID to use for the next transaction written.
*/
public synchronized void setNextTxId(long nextTxId) {
Preconditions.checkArgument(synctxid <= txid &&
nextTxId >= txid,
"May not decrease txid." +
" synctxid=%s txid=%s nextTxId=%s",
synctxid, txid, nextTxId);
txid = nextTxId - 1;
}
/**
* Blocks until all ongoing edits have been synced to disk.
* This differs from logSync in that it waits for edits that have been
* written by other threads, not just edits from the calling thread.
*
* NOTE: this should be done while holding the FSNamesystem lock, or
* else more operations can start writing while this is in progress.
* @throws Exception
*/
void logSyncAll() throws Exception {
// Record the most recent transaction ID as our own id
synchronized (this) {
TransactionId id = myTransactionId.get();
id.txid = txid;
}
// Then make sure we're synced up to this point
logSync();
}
public static void terminate(int status, String msg) throws IOException {
LOG.info("Exiting with status " + status);
IOException ee = new IOException(status + "@" + msg);
LOG.error("Terminate called", ee);
throw ee;
}
public void logSync() throws IOException {
long syncStart = 0;
// Fetch the transactionId of this thread.
long mytxid = myTransactionId.get().txid;
boolean sync = false;
try {
EditLogOutputStream logStream = null;
synchronized (this) {
try {
printStatistics(false);
// if somebody is already syncing, then wait
while (mytxid > synctxid && isSyncRunning) {
try {
wait(1000);
} catch (InterruptedException ie) {
}
}
if (mytxid <= synctxid) {
numTransactionsBatchedInSync++;
return;
}
// now, this thread will do the sync
syncStart = txid;
isSyncRunning = true;
sync = true;
// swap buffers
try {
if (journalSet.isEmpty()) {
throw new IOException("No journals available to flush");
}
editLogStream.setReadyToFlush();
} catch (IOException e) {
final String msg =
"Could not sync enough journals to persistent storage " +
"due to " + e.getMessage() + ". " +
"Unsynced transactions: " + (txid - synctxid);
LOG.fatal(msg, new Exception());
IOUtils.cleanup(LOG, journalSet);
terminate(1, msg);
}
} finally {
// Prevent RuntimeException from blocking other log edit write
doneWithAutoSyncScheduling();
}
//editLogStream may become null,
//so store a local variable for flush.
logStream = editLogStream;
}
// do the sync
long start = System.currentTimeMillis();
try {
if (logStream != null) {
logStream.flush();
}
} catch (IOException ex) {
synchronized (this) {
final String msg =
"Could not sync enough journals to persistent storage. "
+ "Unsynced transactions: " + (txid - synctxid);
LOG.fatal(msg, new Exception());
IOUtils.cleanup(LOG, journalSet);
terminate(1, msg);
}
}
} finally {
// Prevent RuntimeException from blocking other log edit sync
synchronized (this) {
if (sync) {
synctxid = syncStart;
isSyncRunning = false;
}
this.notifyAll();
}
}
}
//
// print statistics every 1 minute.
//
private void printStatistics(boolean force) {
long now = System.currentTimeMillis();
if (lastPrintTime + 60000 > now && !force) {
return;
}
lastPrintTime = now;
StringBuilder buf = new StringBuilder();
buf.append("Number of transactions: ");
buf.append(numTransactions);
buf.append(" Total time for transactions(ms): ");
buf.append(totalTimeTransactions);
buf.append(" Number of transactions batched in Syncs: ");
buf.append(numTransactionsBatchedInSync);
buf.append(" Number of syncs: ");
buf.append(editLogStream.getNumSync());
LOG.info(buf);
}
/**
* Get all the journals this edit log is currently operating on.
*/
synchronized List<JournalAndStream> getJournals() {
return journalSet.getAllJournalStreams();
}
public synchronized long rollEditLog() throws IOException {
LOG.info("Rolling edit logs");
endCurrentLogSegment(true);
long nextTxId = getLastWrittenTxId() + 1;
startLogSegment(nextTxId, true);
assert curSegmentTxId == nextTxId;
return nextTxId;
}
/**
* Start writing to the log segment with the given txid.
* Transitions from BETWEEN_LOG_SEGMENTS state to IN_LOG_SEGMENT state.
*/
synchronized void startLogSegment(final long segmentTxId,
boolean writeHeaderTxn) throws IOException {
LOG.info("Starting log segment at " + segmentTxId);
Preconditions.checkArgument(segmentTxId > 0,
"Bad txid: %s", segmentTxId);
Preconditions.checkState(state == State.BETWEEN_LOG_SEGMENTS,
"Bad state: %s", state);
Preconditions.checkState(segmentTxId > curSegmentTxId,
"Cannot start writing to log segment " + segmentTxId +
" when previous log segment started at " + curSegmentTxId);
Preconditions.checkArgument(segmentTxId == txid + 1,
"Cannot start log segment at txid %s when next expected " +
"txid is %s", segmentTxId, txid + 1);
numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0;
try {
editLogStream = journalSet.startLogSegment(segmentTxId);
} catch (IOException ex) {
throw new IOException("Unable to start log segment " +
segmentTxId + ": too few journals successfully started.", ex);
}
curSegmentTxId = segmentTxId;
state = State.IN_SEGMENT;
if (writeHeaderTxn) {
logSync();
}
}
public synchronized void endCurrentLogSegment(boolean writeEndTxn) throws IOException {
LOG.info("Ending log segment " + curSegmentTxId);
Preconditions.checkState(isSegmentOpen(),
"Bad state: %s", state);
if (writeEndTxn) {
logSync();
}
printStatistics(true);
final long lastTxId = getLastWrittenTxId();
try {
journalSet.finalizeLogSegment(curSegmentTxId, lastTxId);
editLogStream = null;
} catch (IOException e) {
//All journals have failed, it will be handled in logSync.
}
state = State.BETWEEN_LOG_SEGMENTS;
}
/**
* Abort all current logs. Called from the backup node.
*/
synchronized void abortCurrentLogSegment() {
try {
//Check for null, as abort can be called any time.
if (editLogStream != null) {
editLogStream.abort();
editLogStream = null;
state = State.BETWEEN_LOG_SEGMENTS;
}
} catch (IOException e) {
LOG.warn("All journals failed to abort", e);
}
}
/**
* Archive any log files that are older than the given txid.
*
* If the edit log is not open for write, then this call returns with no
* effect.
*/
@Override
public synchronized void purgeLogsOlderThan(final long minTxIdToKeep) {
// Should not purge logs unless they are open for write.
// This prevents the SBN from purging logs on shared storage, for example.
if (!isOpenForWrite()) {
return;
}
assert curSegmentTxId == HdfsConstants.INVALID_TXID || // on format this is no-op
minTxIdToKeep <= curSegmentTxId :
"cannot purge logs older than txid " + minTxIdToKeep +
" when current segment starts at " + curSegmentTxId;
if (minTxIdToKeep == 0) {
return;
}
// This could be improved to not need synchronization. But currently,
// journalSet is not threadsafe, so we need to synchronize this method.
try {
journalSet.purgeLogsOlderThan(minTxIdToKeep);
} catch (IOException ex) {
//All journals have failed, it will be handled in logSync.
}
}
/**
* The actual sync activity happens while not synchronized on this object.
* Thus, synchronized activities that require that they are not concurrent
* with file operations should wait for any running sync to finish.
*/
synchronized void waitForSyncToFinish() {
while (isSyncRunning) {
try {
wait(1000);
} catch (InterruptedException ie) {}
}
}
/**
* Return the txid of the last synced transaction.
* For test use only
*/
synchronized long getSyncTxId() {
return synctxid;
}
// sets the initial capacity of the flush buffer.
synchronized void setOutputBufferCapacity(int size) {
journalSet.setOutputBufferCapacity(size);
}
synchronized void logEdit(final int length, final byte[] data) {
long start = beginTransaction();
try {
editLogStream.writeRaw(data, 0, length);
} catch (IOException ex) {
// All journals have failed, it will be handled in logSync.
}
endTransaction(start);
}
public synchronized void recoverUnclosedStreams() {
Preconditions.checkState(
state == State.BETWEEN_LOG_SEGMENTS,
"May not recover segments - wrong state: %s", state);
try {
journalSet.recoverUnfinalizedSegments();
} catch (IOException ex) {
// All journals have failed, it is handled in logSync.
// TODO: are we sure this is OK?
}
}
@Override
public void selectInputStreams(Collection<EditLogInputStream> streams,
long fromTxId, boolean inProgressOk, boolean forReading) throws IOException {
journalSet.selectInputStreams(streams, fromTxId, inProgressOk, forReading);
}
public Collection<EditLogInputStream> selectInputStreams(
long fromTxId, long toAtLeastTxId) throws IOException {
return selectInputStreams(fromTxId, toAtLeastTxId, true);
}
/** Select a list of input streams to load */
public Collection<EditLogInputStream> selectInputStreams(
long fromTxId, long toAtLeastTxId,
boolean inProgressOk) throws IOException {
return selectInputStreams(fromTxId, toAtLeastTxId, inProgressOk,
true);
}
/**
* Select a list of input streams.
*
* @param fromTxId first transaction in the selected streams
* @param toAtLeast the selected streams must contain this transaction
* @param inProgessOk set to true if in-progress streams are OK
* @param forReading whether or not to use the streams to load the edit log
*/
public synchronized Collection<EditLogInputStream> selectInputStreams(
long fromTxId, long toAtLeastTxId,
boolean inProgressOk, boolean forReading) throws IOException {
List<EditLogInputStream> streams = new ArrayList<EditLogInputStream>();
selectInputStreams(streams, fromTxId, inProgressOk, forReading);
try {
checkForGaps(streams, fromTxId, toAtLeastTxId, inProgressOk);
} catch (IOException e) {
closeAllStreams(streams);
throw e;
}
return streams;
}
/**
* Check for gaps in the edit log input stream list.
* Note: we're assuming that the list is sorted and that txid ranges don't
* overlap. This could be done better and with more generality with an
* interval tree.
*/
private void checkForGaps(List<EditLogInputStream> streams, long fromTxId,
long toAtLeastTxId, boolean inProgressOk) throws IOException {
Iterator<EditLogInputStream> iter = streams.iterator();
long txId = fromTxId;
while (true) {
if (txId > toAtLeastTxId) return;
if (!iter.hasNext()) break;
EditLogInputStream elis = iter.next();
if (elis.getFirstTxId() > txId) break;
long next = elis.getLastTxId();
if (next == HdfsConstants.INVALID_TXID) {
if (!inProgressOk) {
throw new RuntimeException("inProgressOk = false, but " +
"selectInputStreams returned an in-progress edit " +
"log input stream (" + elis + ")");
}
// We don't know where the in-progress stream ends.
// It could certainly go all the way up to toAtLeastTxId.
return;
}
txId = next + 1;
}
throw new IOException(String.format("Gap in transactions. Expected to "
+ "be able to read up until at least txid %d but unable to find any "
+ "edit logs containing txid %d", toAtLeastTxId, txId));
}
/**
* Close all the streams in a collection
* @param streams The list of streams to close
*/
public static void closeAllStreams(Iterable<EditLogInputStream> streams) {
for (EditLogInputStream s : streams) {
IOUtils.closeStream(s);
}
}
}