/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.zip.Checksum;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.Transition;
import org.apache.hadoop.hdfs.server.common.Storage.FormatConfirmable;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.*;
import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.StorageLocationType;
import org.apache.hadoop.hdfs.server.namenode.ValidateNamespaceDirPolicy.NNStorageLocation;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.io.*;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.hadoop.util.PureJavaCrc32;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.*;

import com.google.common.collect.Lists;

/**
* FSEditLog maintains a log of the namespace modifications.
*
*/
public class FSEditLog {
 
  static final Log LOG = LogFactory.getLog(FSEditLog.class);

  public static final long PURGE_ALL_TXID = Long.MAX_VALUE;
  public static String CONF_ROLL_TIMEOUT_MSEC = "dfs.fsedits.timeout.roll.edits.msec";
 
  public static int sizeFlushBuffer = HdfsConstants.DEFAULT_EDIT_BUFFER_SIZE;
  static long preallocateSize= HdfsConstants.DEFAULT_EDIT_PREALLOCATE_SIZE;
  static long maxBufferedTransactions= HdfsConstants.DEFAULT_MAX_BUFFERED_TRANSACTIONS;
  private final ConcurrentSkipListMap<Long, List<Long>> delayedSyncs =
    new ConcurrentSkipListMap<Long, List<Long>>();
  private Thread syncThread;
  private SyncThread syncer;
 
  /**
   * State machine for edit log. The log starts in UNINITIALIZED state upon
   * construction. Once it's initialized, it is usually in IN_SEGMENT state,
   * indicating that edits may be written. In the middle of a roll, or while
   * saving the namespace, it briefly enters the BETWEEN_LOG_SEGMENTS state,
   * indicating that the previous segment has been closed, but the new one has
   * not yet been opened.
   */
  protected enum State {
    UNINITIALIZED,
    BETWEEN_LOG_SEGMENTS,
    IN_SEGMENT,
    CLOSED;
  }
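
  // State transitions, as driven by the methods below:
  //   UNINITIALIZED        -> BETWEEN_LOG_SEGMENTS   in init()
  //   BETWEEN_LOG_SEGMENTS -> IN_SEGMENT             in startLogSegment()
  //   IN_SEGMENT           -> BETWEEN_LOG_SEGMENTS   in endCurrentLogSegment()
  //   any state            -> CLOSED                 in close()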

  protected State state = State.UNINITIALIZED;

  // the set of journals, and the stream currently open for writing
  private JournalSet journalSet;
  private EditLogOutputStream editLogStream = null;

  // a monotonically increasing counter that represents transactionIds.
  private long txid = -1;

  // stores the last synced transactionId.
  private long synctxid = -1;

  // the first txid of the log that's currently open for writing.
  // If this value is N, we are currently writing to edits_inprogress_N
  private long curSegmentTxId = HdfsConstants.INVALID_TXID;

  // the time of printing the statistics to the log file.
  private long lastPrintTime;

  // is a sync currently running?
  private volatile boolean isSyncRunning;
   
  // Used to exit in the event of a failure to sync to all journals. It's a
  // member variable so it can be swapped out for testing.
  static volatile Runtime runtime = Runtime.getRuntime();

  // these are statistics counters.
  private long numTransactions;        // number of transactions
  private long numTransactionsBatchedInSync;
  private long totalTimeTransactions;  // total time for all transactions
  private NameNodeMetrics metrics;
 
  private NNStorage storage; 
  private Configuration conf;
  private Collection<URI> editsDirs;
  private long timeoutRollEdits;

  private static ThreadLocal<Checksum> localChecksumForRead = new ThreadLocal<Checksum>() {
    protected Checksum initialValue() {
      return new PureJavaCrc32();
    }
  };

  private static ThreadLocal<Checksum> localChecksumForWrite = new ThreadLocal<Checksum>() {
    protected Checksum initialValue() {
      return new PureJavaCrc32();
    }
  };

  /** Get a thread local checksum for read */
  static Checksum getChecksumForRead() {
    return localChecksumForRead.get();
  }
 
  /** Get a thread local checksum for write */
  static Checksum getChecksumForWrite() {
    return localChecksumForWrite.get();
  }

  /**
   * Sets the current transaction id of the edit log. This is used when we load
   * the FSImage and FSEdits and read the last transaction id from disk and then
   * we continue logging transactions to the edit log from that id onwards.
   *
   * @param txid
   *          the last transaction id
   */
  public void setLastWrittenTxId(long txid) {
    this.txid = txid;
  }
 
  public void resetTxIds(long txid) throws IOException {
    this.txid = txid;
    this.synctxid = txid;
    this.curSegmentTxId = HdfsConstants.INVALID_TXID;
    this.state = State.BETWEEN_LOG_SEGMENTS;
   
    // Journals need to reset their committed IDs.
    journalSet.setCommittedTxId(txid, true);
  }

  private static class TransactionId {
    public long txid;

    TransactionId(long value) {
      this.txid = value;
    }
  }

  // stores the most current transactionId of this thread.
  private static final ThreadLocal<TransactionId> myTransactionId = new ThreadLocal<TransactionId>() {
    protected synchronized TransactionId initialValue() {
      return new TransactionId(-1L);
    }
  };
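
  // Usage note (see logEdit/logSync below): beginTransaction() records each
  // newly assigned txid in this thread-local, so a later logSync() on the
  // same thread knows the highest txid it must wait for without re-reading
  // shared state written by other threads.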
 
  /**
   * Constructor for FSEditLog. Underlying journals are constructed, but no
   * streams are opened until open() is called.
   *
   * @param conf The namenode configuration
   * @param image The FSImage this edit log belongs to
   * @param storage Storage object used by namenode
   * @param imageDirs List of image directories, used to decide whether a
   *          quorum journal should also store images
   * @param editsDirs List of journals to use
   * @param locationMap contains information about shared/local/remote locations
   */
  FSEditLog(Configuration conf, FSImage image, NNStorage storage,
      Collection<URI> imageDirs, Collection<URI> editsDirs,
      Map<URI, NNStorageLocation> locationMap) {
    init(conf, image, storage, imageDirs, editsDirs, locationMap);
    timeoutRollEdits = conf.getLong(CONF_ROLL_TIMEOUT_MSEC, 0);
  }
 
  private void init(Configuration conf, FSImage image, NNStorage storage,
      Collection<URI> imageDirs, Collection<URI> editsDirs,
      Map<URI, NNStorageLocation> locationMap) {

    isSyncRunning = false;
    this.conf = conf;
    this.storage = storage;
    metrics = NameNode.getNameNodeMetrics();
    lastPrintTime = FSNamesystem.now();

    // If this list is empty, an error will be thrown on first use
    // of the editlog, as no journals will exist
    this.editsDirs = new ArrayList<URI>(editsDirs);

    journalSet = new JournalSet(conf, image, storage, this.editsDirs.size(),
        metrics);
   
    for (URI u : this.editsDirs) {
      boolean required = NNStorageConfiguration.getRequiredNamespaceEditsDirs(
          conf).contains(u);
      boolean shared = false;
      boolean remote = false;
      if (locationMap != null && locationMap.get(u) != null) {
        shared = locationMap.get(u).type == StorageLocationType.SHARED;      
        remote = locationMap.get(u).type == StorageLocationType.REMOTE;
      }
      if (u.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) {
        StorageDirectory sd = storage.getStorageDirectory(u);
        if (sd != null) {
          LOG.info("Adding local file journal: " + u + ", required: " + required);
          // port error reporter
          journalSet.add(new FileJournalManager(sd, metrics, null), required, shared,
              remote);
        }
      } else if (u.getScheme().equals(QuorumJournalManager.QJM_URI_SCHEME)) {
        // for now, we only allow the QJM to store images
        boolean hasImageStorage = imageDirs.contains(u);
        try {
          journalSet.add(new QuorumJournalManager(conf, u, new NamespaceInfo(
              storage), metrics, hasImageStorage), required, shared, remote);
        } catch (Exception e) {
          throw new IllegalArgumentException("Unable to construct journal, " + u,
              e);
        }    
      } else {
        LOG.info("Adding journal: " + u + ", required: " + required);
        journalSet.add(createJournal(conf, u, new NamespaceInfo(storage),
            metrics), required, shared, remote);
      }
    }
    if (journalSet.isEmpty()) {
      LOG.error("No edits directories configured!");
    }
    state = State.BETWEEN_LOG_SEGMENTS;
  }

  /**
   * Get the list of URIs the editlog is using for storage
   *
   * @return collection of URIs in use by the edit log
   */
  Collection<URI> getEditURIs() {
    return editsDirs;
  }
 
  /**
   * Create empty edit log files.
   * Initialize the output stream for logging.
   *
   * @throws IOException
   */
  synchronized void open() throws IOException {
    if (syncer == null) {
      syncer = new SyncThread();
      syncThread = new Thread(syncer);
      syncThread.start();
    }
    if (state != State.BETWEEN_LOG_SEGMENTS)
      throw new IOException("Bad state: " + state);

    startLogSegment(getLastWrittenTxId() + 1, true);
    if (state != State.IN_SEGMENT)
      throw new IOException("Bad state: " + state);
  }
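
  // Lifecycle sketch (hypothetical caller in this package; the method names
  // are from this class):
  //
  //   FSEditLog log = new FSEditLog(conf, image, storage,
  //       imageDirs, editsDirs, null);   // journals built, nothing opened
  //   log.open();                        // segment starts at lastWrittenTxId + 1
  //   log.logMkDir("/foo", fooNode);     // buffered edit, assigned a txid
  //   log.logSync();                     // flush this thread's edits to disk
  //   log.close();                       // finalize segment, stop the syncer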

  synchronized boolean isOpen() {
    return state == State.IN_SEGMENT;
  }

  public synchronized void close() throws IOException {
    if (state == State.CLOSED) {
      LOG.info("Closing log when already closed");
      return;
    }
    if (state == State.IN_SEGMENT) {
      assert editLogStream != null;
      waitForSyncToFinish();
      endCurrentLogSegment(true && InjectionHandler
          .trueCondition(InjectionEvent.FSEDIT_LOG_WRITE_END_LOG_SEGMENT));
    }

    if (syncThread != null) {
      syncer.stop();
      syncThread.interrupt();
    }

    try {
      journalSet.close();
    } catch (IOException ioe) {
      LOG.warn("Error closing journalSet", ioe);   
    }
    state = State.CLOSED;
  }
 
  synchronized void transitionNonFileJournals(StorageInfo nsInfo,
      boolean checkEmpty, Transition transition, StartupOption startOpt)
      throws IOException {
    if (Transition.FORMAT == transition
        && state != State.BETWEEN_LOG_SEGMENTS) {
      throw new IOException("Bad state:" + state);
    }
    journalSet.transitionNonFileJournals(nsInfo, checkEmpty,
        transition, startOpt);
  }

  synchronized List<JournalManager> getNonFileJournalManagers() {
    return journalSet.getNonFileJournalManagers();
  }
 
  synchronized List<FormatConfirmable> getFormatConfirmables()
      throws IOException {
    if (state != State.BETWEEN_LOG_SEGMENTS) {
      throw new IOException("Bad state:" + state);
    }

    List<FormatConfirmable> ret = Lists.newArrayList();
    for (final JournalManager jm : journalSet.getJournalManagers()) {
      // The FJMs are confirmed separately since they are also
      // StorageDirectories
      if (!(jm instanceof FileJournalManager)) {
        ret.add(jm);
      }
    }
    return ret;
  }

  void logEdit(final FSEditLogOp op) {
    synchronized (this) {
      assert state != State.CLOSED;
      // this will increase txid
      long start = beginTransaction();
      op.setTransactionId(txid);

      try {
        if (editLogStream != null) {
          // if stream is null it will be handled in sync
          editLogStream.write(op);
        }
      } catch (IOException ex) {
        LOG.fatal("Could not write to required number of streams", ex);
        runtime.exit(1);
      }
      endTransaction(start);
      // check if it is time to schedule an automatic sync
    }
  }

  /**
   * Check if should automatically sync buffered edits to persistent store
   *
   * @return true if any of the edit stream says that it should sync
   */
  private boolean shouldForceSync() {
    // if editLogStream is null, force a sync so the null stream is
    // detected and handled in logSync()
    return editLogStream == null || editLogStream.shouldForceSync();
  }

  private long beginTransaction() {
    assert Thread.holdsLock(this);
    // get a new transactionId
    txid++;

    //
    // record the transactionId when new data was written to the edits log
    //
    TransactionId id = myTransactionId.get();
    id.txid = txid;
    // obtain time in nanoseconds
    // endTransaction will compute time in microseconds
    return System.nanoTime();

  }

  private void endTransaction(long start) {
    assert Thread.holdsLock(this);

    // update statistics
    numTransactions++;
    long txnTime = DFSUtil.getElapsedTimeMicroSeconds(start);
    totalTimeTransactions += txnTime;
    if (metrics != null) { // Metrics is non-null only when used inside name
                           // node
      metrics.transactions.inc(txnTime);
      metrics.numBufferedTransactions.set((int) (txid - synctxid));
      metrics.currentTxnId.set(txid);
    }

  }
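
  // Timing note: beginTransaction() returns System.nanoTime(), and
  // DFSUtil.getElapsedTimeMicroSeconds(start) is assumed to convert the
  // elapsed nanoseconds to microseconds, i.e. roughly
  // (System.nanoTime() - start) / 1000, which is the value the metrics record.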

  /**
   * Blocks until all ongoing edits have been synced to disk. This differs from
   * logSync in that it waits for edits that have been written by other threads,
   * not just edits from the calling thread.
   *
   * NOTE: this should be done while holding the FSNamesystem lock, or else more
   * operations can start writing while this is in progress.
   */
  public void logSyncAll() throws IOException {
    // Record the most recent transaction ID as our own id
    synchronized (this) {
      TransactionId id = myTransactionId.get();
      id.txid = txid;
    }
    // Then make sure we're synced up to this point
    logSync();
  }

  /**
   * If there are too many transactions that are yet to be synced,
   * then sync them. Otherwise, the in-memory buffer that holds the
   * transactions can grow very large. This can happen when a large
   * number of listStatus calls update the access time of files.
   */
  public void logSyncIfNeeded() {
    boolean doSync = false;
    synchronized (this) {
      if (txid > synctxid + maxBufferedTransactions) {
        FSNamesystem.LOG.info("Out of band log sync triggered " +
                              " because there are " +
                              (txid-synctxid) +
                              " buffered transactions which " +
                              " is more than the configured limit of " +
                              maxBufferedTransactions);
        doSync = true;
      }
      if (shouldForceSync()) {
        FSNamesystem.LOG.info("Log sync triggered by the output stream");
        doSync = true;
      }
    }
    if (doSync) {
      logSync();
    }
  }
 
  public void logSync() {
    logSync(true);
  }

  /**
   * Sync all modifications done by this thread. 
   * 
   * The internal concurrency design of this class is as follows:
   * - Log items are written synchronized into an in-memory buffer,
   * and each assigned a transaction ID. 
   * - When a thread (client) would like to sync all of its edits, logSync() 
   * uses a ThreadLocal transaction ID to determine what edit number must
   * be synced to. 
   * - The isSyncRunning volatile boolean tracks whether a sync is currently
   * in progress.
   * 
   * The data is double-buffered within each edit log implementation so that 
   * in-memory writing can occur in parallel with the on-disk writing. 
   * 
   * Each sync occurs in three steps:
   * 1. synchronized, it swaps the double buffer and sets the isSyncRunning
   * flag. 
   * 2. unsynchronized, it flushes the data to storage 
   * 3. synchronized, it resets the flag and notifies anyone waiting on the
   * sync. 
   * 
   * The lack of synchronization on step 2 allows other threads to continue
   * to write into the memory buffer while the sync is in progress.
   * Because this step is unsynchronized, actions that need to avoid 
   * concurrency with sync() should be synchronized and also call
   * waitForSyncToFinish() before assuming they are running alone. 
   */
  public void logSync(boolean doWait) {

    long syncStart = 0;   
    boolean thisThreadSuccess = false;
    boolean thisThreadSyncing = false;
    EditLogOutputStream logStream = null;
    try {
      synchronized (this) {

        long mytxid = myTransactionId.get().txid;
        myTransactionId.get().txid = -1L;
        if (mytxid == -1) {
          mytxid = txid;
        }

        printStatistics(false);

        // if somebody is already syncing, then wait
        while (mytxid > synctxid && isSyncRunning) {
          if (!doWait) {
            long delayedId = Server.delayResponse();
            List<Long> responses = delayedSyncs.get(mytxid);
            if (responses == null) {
              responses = new LinkedList<Long>();
              delayedSyncs.put(mytxid, responses);
            }
            responses.add(delayedId);
            return;
          }
          try {
            wait(1000);
          } catch (InterruptedException ie) {
          }
        }

        //
        // If this transaction was already flushed, then nothing to do
        //
        if (mytxid <= synctxid) {
          numTransactionsBatchedInSync++;
          if (metrics != null) // Metrics is non-null only when used inside name
            // node
            metrics.transactionsBatchedInSync.inc();
          return;
        }

        // now, this thread will do the sync
        syncStart = txid;
        isSyncRunning = true;
        thisThreadSyncing = true;

        // swap buffers
        try {
          if (journalSet.isEmpty()) {
            throw new IOException(
                "No journals available to flush, journalset is empty");
          }
          if (editLogStream == null) {
            throw new IOException(
                "No journals available to flush, editlogstream is null");
          }
          editLogStream.setReadyToFlush();         
        } catch (IOException e) {
          LOG.fatal("Could not sync enough journals to persistent storage. "
              + "Unsynced transactions: " + (txid - synctxid), new Exception(e));
          runtime.exit(1);
        }
        // editLogStream may become null,
        // so store a local variable for flush.
        logStream = editLogStream;
      }

      // do the sync
      sync(logStream, syncStart);
      thisThreadSuccess = true;
    } finally {
      synchronized (this) {
        if (thisThreadSyncing) {
          if(thisThreadSuccess) {
            // only set this if the sync succeeded
            synctxid = syncStart;
          }
          // if this thread was syncing, clear isSyncRunning
          isSyncRunning = false;
        }
        this.notifyAll();
      }
    }
    endDelay(syncStart);
  }
 
  private void sync(EditLogOutputStream logStream, long syncStart) {
    // do the sync
    long start = System.nanoTime();
    try {
      if (logStream != null) {
        logStream.flush();
      }
    } catch (IOException ex) {
      synchronized (this) {
        LOG.fatal("Could not sync enough journals to persistent storage. "
            + "Unsynced transactions: " + (txid - synctxid), new Exception());
        runtime.exit(1);
      }
    }
    long elapsed = DFSUtil.getElapsedTimeMicroSeconds(start);
    if (metrics != null) // Metrics is non-null only when used inside name node
      metrics.syncs.inc(elapsed);
  }
 
  private void endDelay(long synced) {
    ConcurrentNavigableMap<Long, List<Long>> syncs = delayedSyncs.headMap(
        synced, true);
    for (Iterator<List<Long>> iter = syncs.values().iterator(); iter.hasNext();) {
      List<Long> responses = iter.next();
      for (Long responseId : responses) {
        try {
          Server.sendDelayedResponse(responseId);
        } catch (IOException ex) {
        }
      }
      iter.remove();
    }
   
  }
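
  // Delayed-sync protocol: logSync(false) parks the caller's RPC reply via
  // Server.delayResponse() and files it in delayedSyncs under the caller's
  // txid. Once a sync covering that txid completes, endDelay() walks
  // delayedSyncs.headMap(synced, true) -- every entry at or below the synced
  // txid -- and releases each parked reply with Server.sendDelayedResponse().
  // The SyncThread below picks up parked entries when no client thread is
  // currently syncing.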
 
  private class SyncThread implements Runnable {

    private volatile boolean isRunning = true;

    public void stop() {
      isRunning = false;
    }

    @Override
    public void run() {
      try {
        long syncStart = 0;
        while (isRunning) {
          synchronized (FSEditLog.this) {
            while (isSyncRunning || (isRunning && delayedSyncs.size() == 0)) {
              try {
                FSEditLog.this.wait();
              } catch (InterruptedException iex) {
              }
            }
            if (!isRunning) {
              // Shutting down the edits log
              return;
            }

            // There are delayed transactions waiting to be synced and
            // nobody to sync them
            syncStart = txid;
            isSyncRunning = true;

           
            try {
              if (journalSet.isEmpty()) {
                throw new IOException(
                    "No journals available to flush, journalset is empty");
              }
              if (editLogStream == null) {
                throw new IOException(
                    "No journals available to flush, editlogstream is null");
              }
              editLogStream.flush();
            } catch (IOException ex) {
              synchronized (this) {
                LOG.fatal(
                    "Could not sync enough journals to persistent storage. "
                        + "Unsynced transactions: " + (txid - synctxid),
                    new Exception());
                runtime.exit(1);
              }
            }
          }

          sync(editLogStream, syncStart);
          synchronized (FSEditLog.this) {
            synctxid = syncStart;
            isSyncRunning = false;
            FSEditLog.this.notifyAll();
          }
          endDelay(syncStart);
        }
      } catch (Throwable t) {
        FSNamesystem.LOG.fatal("SyncThread received Runtime exception: ", t);
        Runtime.getRuntime().exit(-1);
      }
    }
   
    public String toString() {
      return "SyncThread";
    }
  }
 
  protected int checkJournals() throws IOException {
    return journalSet.checkJournals("");
  }
 
  protected void updateNamespaceInfo(StorageInfo si) throws IOException {
    journalSet.updateNamespaceInfo(si);
  }

  //
  // print statistics every 1 minute.
  //
  private void printStatistics(boolean force) {
    long now = FSNamesystem.now();
    if (lastPrintTime + 60000 > now && !force) {
      return;
    }
    lastPrintTime = now;
    StringBuilder buf = new StringBuilder();
    buf.append("Number of transactions: ");
    buf.append(numTransactions);
    buf.append(" Number of transactions batched in Syncs: ");
    buf.append(numTransactionsBatchedInSync);
    buf.append(" Number of syncs: ");
    buf.append(editLogStream != null ? editLogStream.getNumSync() : "null");
    buf.append(" Total time for writing transactions (us): ");
    buf.append(totalTimeTransactions);
    buf.append(" Journal sync times (us): ");
    buf.append(journalSet.getSyncTimes());
   
    FSNamesystem.LOG.info(buf);
  }

  /**
   * Add open lease record to edit log.
   * Records the block locations of the last block.
   */
  public void logOpenFile(String path, INodeFileUnderConstruction newNode)
                   throws IOException {
    AddOp op = AddOp.getInstance();
    op.set(newNode.getId(),
        path,
        newNode.getReplication(),
        newNode.getModificationTime(),
        newNode.getAccessTime(),
        newNode.getPreferredBlockSize(),
        newNode.getBlocks(),
        newNode.getPermissionStatus(),
        newNode.getClientName(),
        newNode.getClientMachine());  
    logEdit(op);
  }

  /**
   * Add close lease record to edit log.
   */
  public void logCloseFile(String path, INodeFile newNode) {
    CloseOp op = CloseOp.getInstance();
    op.set(newNode.getId(),
        path,
        newNode.getReplication(),
        newNode.getModificationTime(),
        newNode.getAccessTime(),
        newNode.getPreferredBlockSize(),
        newNode.getBlocks(),
        newNode.getPermissionStatus(),
        null,
        null);
    logEdit(op);
  }
 
  /**
   * Add append file record to the edit log.
   */
  public void logAppendFile(String path, INodeFileUnderConstruction newNode)
                   throws IOException {
    AppendOp op = AppendOp.getInstance();
    op.set(path,
        newNode.getBlocks(),
        newNode.getClientName(),
        newNode.getClientMachine());  
    logEdit(op);
  }
 
  /**
   * Add create directory record to edit log
   */
  public void logMkDir(String path, INode newNode) {
    MkdirOp op = MkdirOp.getInstance();
    op.set(newNode.getId(), path, newNode.getModificationTime(),
        newNode.getPermissionStatus());
    logEdit(op);
  }
 
  /**
   * Add hardlink record to edit log
   */
  public void logHardLink(String src, String dst, long timestamp) {
    HardLinkOp op = HardLinkOp.getInstance();
    op.set(src, dst, timestamp);
    logEdit(op);
  }
 
  /**
   * Add rename record to edit log
   */
  public void logRename(String src, String dst, long timestamp) {
    RenameOp op = RenameOp.getInstance();
    op.set(src, dst, timestamp);
    logEdit(op);
  }
 
  /**
   * Add raidFile record to edit log
   */
  public void logRaidFile(String src, String codecId, short expectedSourceRepl,
      long timestamp) {
    //TODO
  }
 
  /**
   * Add set replication record to edit log
   */
  public void logSetReplication(String src, short replication) {
    SetReplicationOp op = SetReplicationOp.getInstance();
    op.set(src, replication);
    logEdit(op);
  }
 
  /** Add set quota record to edit log
   *
   * @param src the string representation of the path to a directory
   * @param nsQuota the namespace (name count) quota for the directory
   * @param dsQuota the diskspace quota for the directory
   */
  public void logSetQuota(String src, long nsQuota, long dsQuota) {
    SetQuotaOp op = SetQuotaOp.getInstance();
    op.set(src, nsQuota, dsQuota);
    logEdit(op);
  }

  /**  Add set permissions record to edit log */
  public void logSetPermissions(String src, FsPermission permissions) {
    SetPermissionsOp op = SetPermissionsOp.getInstance();
    op.set(src, permissions);
    logEdit(op);
  }

  /**  Add set owner record to edit log */
  public void logSetOwner(String src, String username, String groupname) {
    SetOwnerOp op = SetOwnerOp.getInstance();
    op.set(src, username, groupname);
    logEdit(op);
  }

  /**
   * concat(trg,src..) log
   */
  public void logConcat(String trg, String [] srcs, long timestamp) {
    ConcatDeleteOp op = ConcatDeleteOp.getInstance();
    op.set(trg, srcs, timestamp);
    logEdit(op);
  }
 
  /**
   * Merge(parity, source, ...) log
   * It's used for converting old raided files into new format
   * by merging parity file and source file together into one file
   */
  public void logMerge(String parity, String source, String codecId,
      int[] checksums, long timestamp) {
    MergeOp op = MergeOp.getInstance();
    op.set(parity, source, codecId, checksums, timestamp);
    logEdit(op);
  }
 
  /**
   * Add delete file record to edit log
   */
  public void logDelete(String src, long timestamp) {
    DeleteOp op = DeleteOp.getInstance();
    op.set(src, timestamp);
    logEdit(op);
  }

  /**
   * Add generation stamp record to edit log
   */
  public void logGenerationStamp(long genstamp) {
    SetGenstampOp op = SetGenstampOp.getInstance();
    op.set(genstamp);
    logEdit(op);
  }

  /**
   * Add access time record to edit log
   */
  public void logTimes(String src, long mtime, long atime) {
    TimesOp op = TimesOp.getInstance();
    op.set(src, mtime, atime);
    logEdit(op);
  }

  /**
   * Get all journal streams
   */
  public List<JournalAndStream> getJournals() {
    return journalSet.getAllJournalStreams();
  }

  /**
   * Used only by unit tests.
   */
  public static synchronized void setRuntimeForTesting(Runtime rt) {
    runtime = rt;
  }
 
  /**
   * Return a manifest of what finalized edit logs are available
   */
  public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId)
      throws IOException {
    return journalSet.getEditLogManifest(fromTxId);
  }
 
  /**
   * Finalizes the current edit log and opens a new log segment. 
   * @return the transaction id of the BEGIN_LOG_SEGMENT transaction 
   * in the new log. 
   */
  synchronized long rollEditLog() throws IOException {
    LOG.info("Rolling edit logs.");
    long start = System.nanoTime();
   
    endCurrentLogSegment(true);
    long nextTxId = getLastWrittenTxId() + 1;
    startLogSegment(nextTxId, true);

    assert curSegmentTxId == nextTxId;
    long rollTime = DFSUtil.getElapsedTimeMicroSeconds(start);
    if (metrics != null) {
      metrics.rollEditLogTime.inc(rollTime);
      metrics.tsLastEditsRoll.set(System.currentTimeMillis());
    }
    return nextTxId;
  }
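
  // Roll sketch: with the open segment starting at txid N,
  // endCurrentLogSegment(true) appends OP_END_LOG_SEGMENT (giving the
  // segment its last txid, M) and finalizes it -- edits_N-M under the usual
  // finalized-segment naming -- and startLogSegment(M + 1, true) opens
  // edits_inprogress_(M+1), whose first op is OP_START_LOG_SEGMENT.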
 
  /**
   * Start writing to the log segment with the given txid.
   * Transitions from BETWEEN_LOG_SEGMENTS state to IN_SEGMENT state.
   */
  synchronized void startLogSegment(final long segmentTxId,
      boolean writeHeaderTxn) throws IOException {
    LOG.info("Starting log segment at " + segmentTxId);
    if (segmentTxId < 0) {
      throw new IOException("Bad txid: " + segmentTxId);
    }
    if (state != State.BETWEEN_LOG_SEGMENTS) {
      throw new IOException("Bad state: " + state);
    }
    if (segmentTxId <= curSegmentTxId) {
      throw new IOException("Cannot start writing to log segment "
          + segmentTxId + " when previous log segment started at "
          + curSegmentTxId);
    }
    if (segmentTxId != txid + 1) {
      throw new IOException("Cannot start log segment at txid " + segmentTxId
          + " when next expected " + (txid + 1));
    }

    numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0;

    // TODO no need to link this back to storage anymore!
    // See HDFS-2174.
    storage.attemptRestoreRemovedStorage();
    try {
      editLogStream = journalSet.startLogSegment(segmentTxId);
    } catch (IOException ex) {
      throw new IOException("Unable to start log segment " + segmentTxId
          + ": no journals successfully started.", ex);
    }
    curSegmentTxId = segmentTxId;
    state = State.IN_SEGMENT;
    if (writeHeaderTxn) {
      logEdit(LogSegmentOp.getInstance(FSEditLogOpCodes.OP_START_LOG_SEGMENT));
      logSync();
    }
   
    // force update of journal and image metrics
    journalSet.updateJournalMetrics();
    // If it is configured, we want to schedule an automatic edits roll
    if (timeoutRollEdits > 0) {
      FSNamesystem fsn = this.journalSet.getImage().getFSNamesystem();
      if (fsn != null) {
        // In some test cases fsn is NULL in images. Simply skip the feature.
        AutomaticEditsRoller aer = fsn.automaticEditsRoller;
        if (aer != null) {
          aer.setNextRollTime(System.currentTimeMillis() + timeoutRollEdits);
        } else {
          LOG.warn("Automatic edits roll is enabled but the roller thread "
              + "is not enabled. Should only happen in unit tests.");
        }
      } else {
        LOG.warn("FSNamesystem is NULL in FSEditLog.");
      }
    }
  }
     
  /**
   * Finalize the current log segment. Transitions from IN_SEGMENT state to
   * BETWEEN_LOG_SEGMENTS state.
   */
  synchronized void endCurrentLogSegment(boolean writeEndTxn)
      throws IOException {
    LOG.info("Ending log segment " + curSegmentTxId);
    if (state != State.IN_SEGMENT) {
      throw new IllegalStateException("Bad state: " + state);
    }
    waitForSyncToFinish();
    if (writeEndTxn) {
      logEdit(LogSegmentOp.getInstance(FSEditLogOpCodes.OP_END_LOG_SEGMENT));
    }
    logSyncAll();
    printStatistics(true);
    final long lastTxId = getLastWrittenTxId();

    try {
      journalSet.finalizeLogSegment(curSegmentTxId, lastTxId);
      editLogStream = null;
    } catch (IOException e) {
      // All journals have failed, it will be handled in logSync.
      FSNamesystem.LOG.info("Cannot finalize log segment: " + e.toString());
    }
    state = State.BETWEEN_LOG_SEGMENTS;
  }
   
  /**
   * Archive any log files that are older than the given txid.
   */
  public void purgeLogsOlderThan(final long minTxIdToKeep) {
    synchronized (this) {
      // synchronized to prevent findbugs warning about inconsistent
      // synchronization. This will be JIT-ed out if asserts are off.
      // On format, curSegmentTxId is INVALID_TXID and the assert is a no-op.
      assert curSegmentTxId == HdfsConstants.INVALID_TXID
          || minTxIdToKeep <= curSegmentTxId : "cannot purge logs older than txid "
          + minTxIdToKeep + " when current segment starts at " + curSegmentTxId;
      try {
        journalSet.purgeLogsOlderThan(minTxIdToKeep);
      } catch (IOException ex) {
        // All journals have failed, it will be handled in logSync.
      }

    }
  }
 
  /**
   * The actual sync activity happens while not synchronized on this object.
   * Thus, synchronized activities that require that they are not concurrent
   * with file operations should wait for any running sync to finish.
   */
  synchronized void waitForSyncToFinish() {
    while (isSyncRunning) {
      try {
        wait(1000);
      } catch (InterruptedException ie) {
      }
    }
  }
 
  /**
   * Return the txid of the last synced transaction. For test use only
   */
  synchronized long getSyncTxId() {
    return synctxid;
  }

  /**
   * Run recovery on all journals to recover any unclosed segments
   */
  void recoverUnclosedStreams() {
    try {
      journalSet.recoverUnfinalizedSegments();
    } catch (IOException ex) {
      // All journals have failed, it is handled in logSync.
    }
  }
 
  /**
   * Select a list of input streams to load.
   *
   * @param streams collection into which the selected streams are returned
   * @param fromTxId first transaction in the selected streams
   * @param toAtLeastTxId the selected streams must contain this transaction
   * @param minRedundancy the minimum number of journals that must contain
   *          each transaction
   *
   * @return true if the redundancy requirement was not met
   */
  public synchronized boolean selectInputStreams(
      Collection<EditLogInputStream> streams, long fromTxId,
      long toAtLeastTxId, int minRedundancy) throws IOException {
   
    // at this point we should not have any non-finalized segments
    // this function is called at startup, and must be invoked after
    // recovering all in progress segments
    if (journalSet.hasUnfinalizedSegments(fromTxId)) {
      LOG.fatal("All streams should be finalized");
      throw new IOException("All streams should be finalized at startup");
    }
   
    // get all finalized streams
    boolean redundancyViolated = journalSet.selectInputStreams(streams,
        fromTxId, false, false, minRedundancy);
   
    try {
      checkForGaps(streams, fromTxId, toAtLeastTxId, true);
    } catch (IOException e) {
      closeAllStreams(streams);
      throw e;
    }
    return redundancyViolated;
  }
 
  /**
   * Check for gaps in the edit log input stream list.
   * Note: we're assuming that the list is sorted and that txid ranges don't
   * overlap.  This could be done better and with more generality with an
   * interval tree.
   */
  private void checkForGaps(Collection<EditLogInputStream> streams, long fromTxId,
      long toAtLeastTxId, boolean inProgressOk) throws IOException {
    Iterator<EditLogInputStream> iter = streams.iterator();
    long txId = fromTxId;
    while (true) {
      if (txId > toAtLeastTxId)
        return;
      if (!iter.hasNext())
        break;
      EditLogInputStream elis = iter.next();
      if (elis.getFirstTxId() > txId) {
        break;
      }
      long next = elis.getLastTxId();
      if (next == HdfsConstants.INVALID_TXID) {
        if (!inProgressOk) {
          throw new RuntimeException("inProgressOk = false, but "
              + "selectInputStreams returned an in-progress edit "
              + "log input stream (" + elis + ")");
        }
        // We don't know where the in-progress stream ends.
        // It could certainly go all the way up to toAtLeastTxId.
        return;
      }
      txId = next + 1;
    }
    throw new IOException(String.format("Gap in transactions. Expected to "
        + "be able to read up until at least txid %d but unable to find any "
        + "edit logs containing txid %d", toAtLeastTxId, txId));
  }
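
  // Worked example (hypothetical txid ranges): with fromTxId = 1,
  // toAtLeastTxId = 30 and finalized streams [1..10], [11..25], [26..40],
  // txId advances 1 -> 11 -> 26 -> 41 > 30 and the method returns cleanly.
  // With streams [1..10], [20..40] instead, the second stream's
  // getFirstTxId() = 20 exceeds txId = 11, the loop breaks, and the trailing
  // IOException reports a gap at txid 11.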

  /**
   * Close all the streams in a collection
   *
   * @param streams The list of streams to close
   */
  static void closeAllStreams(Iterable<EditLogInputStream> streams) {
    for (EditLogInputStream s : streams) {
      IOUtils.closeStream(s);
    }
  }
 
  /**
   * Retrieve the implementation class for a Journal scheme. 
   * @param conf The configuration to retrieve the information from
   * @param uriScheme The uri scheme to look up. 
   * @return the class of the journal implementation 
   * @throws IllegalArgumentException if no class is configured for uri
   */
  static Class<? extends JournalManager> getJournalClass(Configuration conf,
      String uriScheme) {
    String key = "dfs.name.edits.journal-plugin" + "." + uriScheme;
    Class<? extends JournalManager> clazz = null;
    try {
      clazz = conf.getClass(key, null, JournalManager.class);
    } catch (RuntimeException re) {
      throw new IllegalArgumentException("Invalid class specified for "
          + uriScheme, re);
    }

    if (clazz == null) {
      LOG.warn("No class configured for " + uriScheme + ", " + key
          + " is empty");
      throw new IllegalArgumentException("No class configured for " + uriScheme);
    }
    return clazz;
  }
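
  // Configuration sketch (hypothetical plugin): a URI such as
  // "myjournal://host/path" resolves its manager class through
  //   dfs.name.edits.journal-plugin.myjournal = com.example.MyJournalManager
  // where MyJournalManager extends JournalManager and, for createJournal()
  // below, must declare a (Configuration, URI, NamespaceInfo,
  // NameNodeMetrics) constructor.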
   
  /**
   * Construct a custom journal manager.
   * The class to construct is taken from the configuration.
   * @param uri Uri to construct
   * @return The constructed journal manager
   * @throws IllegalArgumentException if no class is configured for uri
   */
  public static JournalManager createJournal(Configuration conf, URI uri,
      NamespaceInfo nsInfo, NameNodeMetrics metrics) {
    Class<? extends JournalManager> clazz = getJournalClass(conf,
        uri.getScheme());

    try {
      Constructor<? extends JournalManager> cons = clazz.getConstructor(
          Configuration.class, URI.class, NamespaceInfo.class,
          NameNodeMetrics.class);
      return cons.newInstance(conf, uri, nsInfo, metrics);
    } catch (Exception e) {
      throw new IllegalArgumentException("Unable to construct journal, " + uri,
          e);
    }
  }

  // sets the initial capacity of the flush buffer.
  static void setBufferCapacity(int size) {
    sizeFlushBuffer = size;
  }

  // sets the maximum number of transactions to be buffered in memory.
  static void setMaxBufferedTransactions(int num) {
    maxBufferedTransactions = num;
  }

  // sets the preallocate trigger of the edits log.
  static void setPreallocateSize(long size) {
    preallocateSize = size;
  }

  /**
   * Return the transaction ID for the transaction that was written last.
   */
  synchronized long getLastWrittenTxId() {
    return txid;
  }
 
  public synchronized long getCurrentTxId() {
    return txid + 1;
  }
 
  synchronized long getLastSyncedTxId() {
    return synctxid;
  }
 
  /**
   * @return the first transaction ID in the current log segment
   */
  public synchronized long getCurSegmentTxId() {
    assert state == State.IN_SEGMENT : "Bad state: " + state;
    return curSegmentTxId;
  }
 
  /**
   * Get number of journals available
   */
  public int getNumberOfAvailableJournals() throws IOException {
    return checkJournals();
  }
 
  /**
   * Get number of journals (enabled and disabled).
   */
  public int getNumberOfJournals() throws IOException {
    return journalSet.getNumberOfJournals();
  }
 
  /**
   * Check if the shared journal is available
   */
  public boolean isSharedJournalAvailable() throws IOException {
    return journalSet.isSharedJournalAvailable();
  }
 
  public void setTimeoutRollEdits(long timeoutRollEdits) {
    this.timeoutRollEdits = timeoutRollEdits;
  }
 
  /**
   * A class to read in blocks stored in the old format. The only two
   * fields in the block were blockid and length.
   */
  static class BlockTwo implements Writable {
    long blkid;
    long len;

    static {                                      // register a ctor
      WritableFactories.setFactory
        (BlockTwo.class,
         new WritableFactory() {
           public Writable newInstance() { return new BlockTwo(); }
         });
    }


    BlockTwo() {
      blkid = 0;
      len = 0;
    }
    /////////////////////////////////////
    // Writable
    /////////////////////////////////////
    public void write(DataOutput out) throws IOException {
      out.writeLong(blkid);
      out.writeLong(len);
    }

    public void readFields(DataInput in) throws IOException {
      this.blkid = in.readLong();
      this.len = in.readLong();
    }
  }
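
  // Round-trip sketch (hypothetical, using the org.apache.hadoop.io buffers
  // imported above); BlockTwo serializes as exactly two longs, 16 bytes:
  //
  //   DataOutputBuffer out = new DataOutputBuffer();
  //   new BlockTwo().write(out);            // blkid, then len
  //   DataInputBuffer in = new DataInputBuffer();
  //   in.reset(out.getData(), out.getLength());
  //   BlockTwo copy = new BlockTwo();
  //   copy.readFields(in);                  // reads them back in order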
}