Source Code of org.apache.hadoop.hdfs.server.namenode.AvatarNode

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;

import java.io.FilenameFilter;
import java.io.IOException;
import java.io.BufferedOutputStream;
import java.io.BufferedInputStream;
import java.io.DataOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileInputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.UnknownHostException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.text.ParseException;
import java.text.SimpleDateFormat;

import javax.management.NotCompliantMBeanException;
import javax.management.StandardMBean;

import org.apache.hadoop.ipc.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.metrics.util.MBeanUtil;
import org.apache.hadoop.hdfs.AvatarFailoverSnapshot;
import org.apache.hadoop.hdfs.AvatarZooKeeperClient;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.FileStatusExtended;
import org.apache.hadoop.hdfs.OpenFilesInfo;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.hdfs.protocol.AvatarProtocol;
import org.apache.hadoop.hdfs.protocol.AvatarConstants.Avatar;
import org.apache.hadoop.hdfs.protocol.AvatarConstants.StartupOption;
import org.apache.hadoop.hdfs.protocol.AvatarConstants.InstanceId;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.server.protocol.AvatarDatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockFlags;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockReport;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.IncrementalBlockReport;
import org.apache.hadoop.hdfs.server.protocol.ReceivedBlockInfo;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.datanode.DatanodeProtocols;
import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
import org.apache.hadoop.hdfs.server.namenode.ClusterJspHelper.NameNodeKey;
import org.apache.hadoop.hdfs.server.namenode.metrics.AvatarNodeStatusMBean;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.hdfs.util.InjectionHandler;
import org.apache.hadoop.hdfs.util.LightWeightBitSet;
import org.apache.zookeeper.data.Stat;

/**
 * This is an implementation of the AvatarNode, a hot
 * standby for the NameNode.
 * The AvatarNode has two avatars: the Standby avatar and the Active
 * avatar.
*
* In the Standby avatar, the AvatarNode is consuming transaction logs
* generated by the primary (via a transaction log stored in a shared device).
 * Typically, the primary Namenode is writing transactions to an NFS filesystem
* and the Standby is reading the log from the same NFS filesystem. The
* Standby is also making periodic checkpoints to the primary namenode.
*
* A manual command can switch the AvatarNode from the Standby avatar
 * to the Active avatar. In the Active avatar, the AvatarNode performs precisely
 * the same functionality as a regular Namenode. The switch from the
 * Standby avatar to the Active avatar is fast and typically completes
 * within seconds.
*
 * Typically, an administrator will require two shared mount points for
 * transaction logs. They have to be set in dfs.name.dir.shared0 and
 * dfs.name.dir.shared1 (and similarly dfs.name.edits.dir.shared0/1 for edits).
 * Then the administrator starts the AvatarNode on two different machines as follows:
*
* bin/hadoop org.apache.hadoop.hdfs.server.namenode.AvatarNode -zero -active
* bin/hadoop org.apache.hadoop.hdfs.server.namenode.AvatarNode -one -standby
 * The first AvatarNode uses dfs.name.dir.shared0 while the second
 * AvatarNode uses dfs.name.dir.shared1 to write its transaction logs.
* Also, at startup, the first instance is the primary Namenode and the
 * second instance is the Standby.
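 *
 * As an illustration only (the paths below are examples, not defaults),
 * the shared directories might be configured as:
 *   dfs.name.dir.shared0        = /mnt/nfs0/dfs/name
 *   dfs.name.dir.shared1        = /mnt/nfs1/dfs/name
 *   dfs.name.edits.dir.shared0  = /mnt/nfs0/dfs/edits
 *   dfs.name.edits.dir.shared1  = /mnt/nfs1/dfs/edits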
*
 * After a while, the administrator decides to change the avatar of the
 * second instance to Active. In this case, they must first ensure that the
 * first instance is really dead. This code does not handle the
 * split-brain scenario where there are two active namenodes in one cluster.
*
*/

public class AvatarNode extends NameNode
    implements AvatarProtocol, AvatarNodeStatusMBean {

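  // Pull in the avatar-specific configuration resources in addition to the
  // standard Hadoop defaults; values in avatar-site.xml override those in
  // avatar-default.xml.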
  static {
    Configuration.addDefaultResource("avatar-default.xml");
    Configuration.addDefaultResource("avatar-site.xml");
  }

  public static final Log LOG = LogFactory.getLog(AvatarNode.class.getName());
  private static final int    INVALIDATES_CLEANUP_INTERVAL = 60 * 1000;
  private static final String STORAGE_FILE_LOCK     = "in_use.lock";
  private static final String EDITSFILE     = "/current/edits";
  private static final String EDITSNEW     = "/current/edits.new";
  private static final String TIMEFILE     = "/current/fstime";
  private static final String IMAGEFILE = "/current/fsimage";
  private static final String IMAGENEW     = "/current/fsimage.ckpt";
  public static final long TXID_IGNORE = -1;
  public static final String FAILOVER_SNAPSHOT_FILE = "failover_snapshot_file";
  static final SimpleDateFormat dateForm =
    new SimpleDateFormat("yyyy-MM-dd-HH:mm:ss.SSS");

  // The instanceId is assigned at startup time and does not change for
  // the lifetime of the Node. The administrator has to name each instance
  // of the AvatarNode with a different instanceId. The instance id is used
  // by the AvatarNode to determine which shared devices it should use to
  // checkpoint the image.
  //
  private InstanceId instance;

  // The time when (and if) the fsimage was synced from the remote AvatarNode
  volatile private long startCheckpointTime;

  private Server server;                   /** RPC server */
  private InetSocketAddress serverAddress; /** RPC server address */
  private volatile Avatar currentAvatar;            // the current incarnation of this node
  private Standby standby;                 // the standby object
  private Configuration confg;             // config for the standby namenode
  private Configuration startupConf;       // config for the namenode
  private Thread standbyThread;            // the standby daemon thread
  private InvalidatesCleaner cleaner;      // The thread cleaning up invalidates
  private Thread cleanerThread;

  private RunInfo runInfo;
  private long sessionId;

  private StandbySafeMode standbySafeMode;
  private volatile boolean isInitialized = false;

  protected final boolean enableTestFramework; 
  protected final boolean enableTestFrameworkFsck;
 
  private String failoverFsck = "";

  /**
   * The startup Conf is the original configuration of the AvatarNode. It is used by the
   * secondary namenode to talk to the primary namenode.
   * The conf is the modified configuration that is used by the standby namenode
   */
  AvatarNode(Configuration startupConf, Configuration conf,
             StartupInfo startInfo, RunInfo runInfo, long sessionId) throws IOException {
    super(conf);   
    this.sessionId = sessionId;
    this.runInfo = runInfo;
    this.instance = startInfo.instance;
    this.enableTestFramework =
      (conf.getFloat("dfs.avatarnode.failover.sample.percent", 0.0f) != 0.0f);
    this.enableTestFrameworkFsck =
        (conf.getBoolean("dfs.avatarnode.failover.fsck", false));
   
    // if we are starting as the standby then
    // record the fstime of the checkpoint that we are about to sync from
    if (startInfo.isStandby) {
      // Set the checkpoint time to the fstime of the image and edits
      // that were copied
      setStartCheckpointTime(readLocalFstime(conf));
    }

    initialize(conf);
    currentAvatar = startInfo.isStandby ? Avatar.STANDBY : Avatar.ACTIVE;
    this.startupConf = startupConf;
    this.confg = conf;
    this.nameserviceId = startInfo.serviceName;

    if (currentAvatar == Avatar.STANDBY) {
      // Verify we have the correct safemode.
      SafeModeInfo safeMode = super.namesystem.getSafeModeInstance();
      if (safeMode == null || !(safeMode instanceof StandbySafeMode)) {
        throw new IOException("Invalid safe mode for Standby Avatar : "
            + safeMode + " Standby Avatar should be using "
            + StandbySafeMode.class + " as its dfs.safemode.impl");
      }
      standbySafeMode = (StandbySafeMode) safeMode;

      // Standby has a different property for the max buffered transactions
      // to replay the log faster
      int maxStandbyBufferedTransactions =
        confg.getInt("dfs.max.standby.buffered.transactions",
            HdfsConstants.DEFAULT_MAX_BUFFERED_TRANSACTIONS);
      FSEditLog.setMaxBufferedTransactions(maxStandbyBufferedTransactions);

      // Create a standby object which does the actual work of
      // processing transactions from the primary and checkpointing
      standby = new Standby(this, startupConf, confg);
      standbyThread = new Thread(standby);
      standbyThread.start();
      cleaner = new InvalidatesCleaner();
      cleanerThread = new Thread(cleaner);
      cleanerThread.start();
    }
    isInitialized = true;
  }
 
  protected void setFailoverFsck(String fsck) {
    failoverFsck = fsck;
  }

  /**
   * Generates a new session id for the cluster and writes it to zookeeper. Some
   * other data in zookeeper (like the last transaction id) is written to
   * zookeeper with the sessionId so that we can easily determine in which
   * session was this data written. The sessionId is unique since it uses the
   * current time.
   *
   * @return the session id that it wrote to ZooKeeper
   * @throws IOException
   */
  private static long writeSessionIdToZK(Configuration conf) throws IOException {
    long ssid = -1;
    int maxTries = conf.getInt("dfs.avatarnode.sync.ssidtxid.retries", 3);
    // The session id is verified after writing it to ZK.
    int tries = 0;
    while (tries < maxTries) {
      AvatarZooKeeperClient zk = new AvatarZooKeeperClient(conf, null);
      try {
        ssid = now();
        zk.registerPrimarySsId(getClusterAddress(conf), ssid);
        // Be extra careful and verify the data was synced to zk.
        Long ssIdInZk = zk.getPrimarySsId(getClusterAddress(conf));
        if (ssid != ssIdInZk) {
          throw new IOException("Session Id in the NameNode : " + ssid +
              " does not match the session Id in Zookeeper : " + ssIdInZk);
        }
        break;
      } catch(Exception e) {
        if (tries == maxTries - 1 ) {
          throw new IOException(e);
        }
      } finally {
        try {
          zk.shutdown();
        } catch (InterruptedException ie) {
          if (tries == maxTries - 1) {
            throw new IOException(ie);
          }
        }
      }
    }
    return ssid;
  }

  /**
   * Wait for the StandbyNode to exit. If it does, then stop the underlying namenode.
   */
  void waitForRestart() {
    if (standbyThread != null) {
      try {
        // if this is the standby avatarnode, then wait for the Standby to exit
        standbyThread.join();
      } catch (InterruptedException ie) {
        //eat it up
      }
      standbyThread = null;
      LOG.info("waitForRestart Standby thread exited.");

      // if we are still in standby mode, that means we need to restart from scratch.
      if (getAvatar() == Avatar.STANDBY) {
        runInfo.isRunning = false;
        LOG.info("waitForRestart Stopping encapsulated namenode.");
        super.stop();            // terminate encapsulated namenode
        super.join();            // wait for encapsulated namenode to exit
        shutdownStandby();
        LOG.info("waitForRestart exiting");
        return;
      }
    }
    super.join();            // wait for encapsulated namenode
  }

  public void registerMBean() {
    StandardMBean avatarNodeBean;
    try {
      avatarNodeBean = new StandardMBean(this, AvatarNodeStatusMBean.class);
      MBeanUtil.registerMBean("AvatarNode", "AvatarNodeState", avatarNodeBean);
    } catch (NotCompliantMBeanException mex) {
      LOG.error("Error registering mbean with JMX", mex);
    }
  }
 
  @Override
  public String getInstance() {
    return this.instance.toString();
  }
 
  @Override
  public String getState() {
    return this.currentAvatar.toString();
  }
 
  @Override
  public long getLagBytes() {
    if (this.standby == null) {
      return 0;
    }
    return this.standby.getLagBytes();
  }

  public Configuration getStartupConf() {
    return this.startupConf;
  }

  /**
   * Initialize AvatarNode
   * @param conf the configuration
   */
  private void initialize(Configuration conf) throws IOException {
    InetSocketAddress socAddr = AvatarNode.getAddress(conf);
    int handlerCount = conf.getInt("hdfs.avatarnode.handler.count", 3);

    // create rpc server
    this.server = RPC.getServer(this, socAddr.getHostName(),
                                socAddr.getPort(),
                                handlerCount, false, conf);

    // The rpc-server port can be ephemeral... ensure we have the
    // correct info
    this.serverAddress = this.server.getListenerAddress();
    LOG.info("AvatarNode up at: " + this.serverAddress);
    this.registerMBean();
    this.server.start();
  }

  /**
   * If the specified protocol is AvatarProtocol, then return the
   * AvatarProtocol version id, otherwise delegate to the underlying
   * namenode.
   */
  public long getProtocolVersion(String protocol,
                                 long clientVersion) throws IOException {
    if (protocol.equals(AvatarProtocol.class.getName())) {
      return AvatarProtocol.versionID;
    } else {
      return super.getProtocolVersion(protocol, clientVersion);
    }
  }

  //
  // methods to support Avatar Protocol
  //

  /**
   * {@inheritDoc}
   */
  public synchronized Avatar getAvatar() {
    return currentAvatar;
  }
 
  /**
   * {@inheritDoc}
   */
  public Avatar reportAvatar() {
    return currentAvatar;
  }
 
  /**
   * {@inheritDoc}
   */
  public boolean isInitialized() {
    return isInitialized;
  }

  private static class ShutdownAvatarThread extends Thread {
    private final AvatarNode node;

    public ShutdownAvatarThread(AvatarNode node) {
      this.node = node;
    }

    public void run() {
      try {
        node.runInfo.shutdown = true;
        LOG.info("Shutdown thread for: " + node.currentAvatar + " starting...");
        if (node.currentAvatar == Avatar.STANDBY) {
          // make sure that all transactions are consumed
          try {
            node.standby.quiesce(AvatarNode.TXID_IGNORE);
          } catch (Throwable e) {
            LOG.warn("Standby: ", e);
          }
        }
        // Need to stop RPC threads before capturing any final data about the
        // primary avatar.
        node.stopRPC(false);

        long totalBlocks = node.namesystem.getBlocksTotal();
        String fsck = "";
        try {
          if (node.enableTestFramework
              && node.enableTestFrameworkFsck) {
            LOG.info("Failover: Test framework - running fsck");
            fsck = node.runFailoverFsck();
            LOG.info("Failover: Test framework - fsck done");
          }
        } catch (IOException e) { /*ignore*/ }
       
        // after quiescing all communication and joining
        // all threads, we should still have all streams available
        node.verifyEditStreams();
       
        // stop the node (namesystem, fsimage, editlog, etc.)
        node.stop();
        node.join();            // wait for encapsulated namenode to exit
       
        if (InjectionHandler.falseCondition(InjectionEvent.AVATARNODE_SHUTDOWN)) {
          // simulate crash
          return;
        }
       
        if (node.currentAvatar == Avatar.STANDBY) {
          node.shutdownStandby();
        } else if (node.currentAvatar == Avatar.ACTIVE) {
          // If we are the primary we need to sync our last transaction id to
          // zookeeper.
          node.writeFailoverTestData(fsck);
          node.writeLastTxidToZookeeper(totalBlocks);
        }
      } catch (Exception e) {
        LOG.error("shutdownAvatar() failed", e);
      }
    }
  }

  private void verifyEditStreams() throws IOException {
    if (currentAvatar == Avatar.STANDBY)
      return;
    int expectedEditStreams = FSNamesystem.getNamespaceEditsDirs(confg).size();
    int actualEditStreams = this.namesystem.getFSImage().getEditLog()
        .getNumEditStreams();
    if (expectedEditStreams != actualEditStreams
        || InjectionHandler
            .falseCondition(InjectionEvent.AVATARNODE_CHECKEDITSTREAMS)) {
      String msg = "Failover: Cannot proceed - number of required edit streams: "
          + expectedEditStreams + " current number: " + actualEditStreams;
      LOG.fatal(msg);
      throw new IOException(msg);
    }
  }
 
  /**
   * Shuts down the avatar node
   * @param synchronous - should the function wait for the shutdown to complete
   * @throws IOException
   */
  public synchronized void shutdown(boolean synchronous) throws IOException {
    LOG.info("Shutdown: Asynchronous shutdown for: " + currentAvatar);
   
    if (runInfo.shutdown) {
      LOG.info("Shutdown: Node already shut down");
      return;
    }
    runInfo.shutdown = true;
   
    verifyEditStreams();
    Thread shutdownThread = new ShutdownAvatarThread(this);
    shutdownThread.setName("ShutDown thread for : " + serverAddress);
    shutdownThread.setDaemon(false);
    shutdownThread.start();
   
    if (synchronous) {
      LOG.info("Shutdown: Waiting for shutdown to complete");
      try {
        shutdownThread.join();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }
  }
 
  @Override
  public void shutdownAvatar() throws IOException {
    shutdown(false);
  }
 
  /**
   * Used only for testing.
   */
  public void doCheckpoint() throws IOException {
    if (currentAvatar != Avatar.STANDBY) {
      throw new IOException("This is not the standby avatar");
    }
    standby.doCheckpoint();
  }
 
  /**
   * Used only for testing.
   */
  public Standby getStandby() throws IOException {
    if (currentAvatar != Avatar.STANDBY) {
      throw new IOException("This is not the standby avatar");
    }
    return standby;
  }

  public long getSessionId() throws IOException {
    if (currentAvatar != Avatar.ACTIVE) {
      throw new IOException("This is not the primary avatar");
    }
    return this.sessionId;
  }

  /**
   * Used only for testing.
   */
  public void quiesceStandby(long txId) throws IOException {
    if (currentAvatar != Avatar.STANDBY) {
      throw new IOException("This is not the standby avatar");
    }
    standby.quiesce(txId);
  }

  public static String getClusterAddress(Configuration conf)
    throws UnknownHostException {
    InetSocketAddress addr = NameNode.getClientProtocolAddress(conf);
    return addr.getHostName() + ":" + addr.getPort();
  }

  /**
   * Writes the last transaction id of the primary avatarnode to zookeeper.
   *
   * @throws IOException
   */
  private void writeLastTxidToZookeeper(long totalBlocks) throws IOException {
    long lastTxid = super.getLastWrittenTxId();
    LOG.info("Failover - writing lastTxId: " + lastTxid + ", total blocks: " + totalBlocks);
    if (lastTxid < 0) {
      LOG.warn("Invalid last transaction id : " + lastTxid
          + " skipping write to zookeeper.");
      return;
    }
    ZookeeperTxId zkTxid = new ZookeeperTxId(this.sessionId, lastTxid,
        totalBlocks);
    int maxTries = startupConf.getInt("dfs.avatarnode.sync.ssidtxid.retries", 3);
    int tries = 0;
    while (true) {
      AvatarZooKeeperClient zk = new AvatarZooKeeperClient(confg, null);
      try {
        zk.registerLastTxId(getClusterAddress(this.startupConf), zkTxid);
        return;
      } catch (Exception e) {
        if (tries > maxTries) {
          throw new IOException(e);
        } else {
          tries++;
          LOG.warn("Error syncing last txid to zk, retrying ....", e);
          try {
            Thread.sleep(5000);
          } catch (InterruptedException ie) {
            throw new IOException("writeLastTxidToZookeeper() interrupted", ie);
          }
        }
      } finally {
        try {
          zk.shutdown();
        } catch (InterruptedException ie) {
          throw new IOException(ie);
        }
      }
    }
  }

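  /**
   * Stops the standby machinery: the Standby worker, the AvatarNode RPC
   * server, and the invalidates-cleaner thread.
   */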
  public void shutdownStandby() {
    standby.shutdown();

    if (server != null) {    // shutdown the AvatarNode
      LOG.info("Stopping avatarnode rpcserver.");
      server.stop();
      try {
        server.join();
      } catch (InterruptedException ie) {
        //eat it up
      }
    }
    if (cleaner != null) {
      // Shut down the cleaner thread as it will keep
      // the process from shutting down
      cleaner.stop();
      cleanerThread.interrupt();
      try {
        cleanerThread.join();
      } catch (InterruptedException iex) {
        Thread.currentThread().interrupt();
      }
    }
  }
 
  /**
   * Stops all RPC threads and ensures that all RPC handlers have exited.
   * Stops all communication to the namenode.
   */
  protected void stopRPC(boolean interruptClientHandlers) throws IOException {
    try {
      super.stopRPC(interruptClientHandlers);
      LOG.info("stopRPC: Stopping avatardatanode server");
      this.server.stop(interruptClientHandlers);
      this.server.waitForHandlers();
    } catch (InterruptedException ex) {
      throw new IOException("stopRPC() interrupted", ex);
    }
  }

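  /**
   * Reads the old primary's session id and last transaction id from ZooKeeper
   * and verifies that both were written in the same session.
   */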
  private ZookeeperTxId getLastTransactionId() throws IOException {
    try {
      AvatarZooKeeperClient zk = new AvatarZooKeeperClient(confg, null);
      try {
        // Gather session id and transaction id data.
        String address = getClusterAddress(this.startupConf);
        long sessionId = zk.getPrimarySsId(address);
        ZookeeperTxId zkTxId = zk.getPrimaryLastTxId(address);
        if (sessionId != zkTxId.getSessionId()) {
          throw new IOException("Session Id in the ssid node : " + sessionId
              + " does not match the session Id in the txid node : "
              + zkTxId.getSessionId());
        }
        return zkTxId;
      } finally {
        zk.shutdown();
      }
    } catch (Exception e) {
      throw new IOException(e);
    }
  }

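  /**
   * Verifies that this node's last written transaction id matches the one the
   * old primary recorded in ZooKeeper, and that the total block counts agree.
   */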
  private void verifyTransactionIds(ZookeeperTxId zkTxId) throws IOException {
    long zkLastTxId = zkTxId.getTransactionId();
    long totalBlocks = zkTxId.getTotalBlocks();
    long lastTxId = super.getLastWrittenTxId();

    // Verify transacation ids.
    if (lastTxId < 0 || zkLastTxId < 0) {
      throw new IOException("Invalid transacation ids, txid in NameNode : "
          + lastTxId + " txid in Zookeeper : " + zkLastTxId);
    } else if (lastTxId != zkLastTxId) {
      throw new IOException("The transacation id in the namenode : "
          + lastTxId + " does not match the transaction id in zookeeper : "
          + zkLastTxId);
    } else if (totalBlocks != super.namesystem.getBlocksTotal()) {
      throw new IOException("Total blocks in ZK : " + totalBlocks
          + " don't match up with total blocks on Standby : "
          + super.namesystem.getBlocksTotal());
    }
  }

  private void registerAddressToZK(AvatarZooKeeperClient zk, String confParam)
      throws IOException {
    String address = startupConf.get(confParam);
    String realAddress = confg.get(confParam);
    if (address != null && realAddress != null) {
      zk.registerPrimary(address, realAddress, true);
    }
  }

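  /**
   * Publishes this node's client, datanode-protocol, http and rpc addresses in
   * ZooKeeper so that clients and datanodes can find the new primary.
   */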
  private void registerAsPrimaryToZK() throws IOException {
    // Register client port address.
    String address = getClusterAddress(startupConf);
    String realAddress = getClusterAddress(confg);
    AvatarZooKeeperClient zk = new AvatarZooKeeperClient(confg, null);
    try {
      zk.registerPrimary(address, realAddress, true);

      // Register dn protocol address
      registerAddressToZK(zk, "dfs.namenode.dn-address");

      // Register http address
      registerAddressToZK(zk, "dfs.http.address");

      // Register rpc address
      registerAddressToZK(zk, AvatarNode.DFS_NAMENODE_RPC_ADDRESS_KEY);
    } finally {
      try {
        zk.shutdown();
      } catch (InterruptedException e) {
        throw new IOException("Could not shutdown zk client", e);
      }
    }
  }

  private File getSnapshotFile(Configuration conf) throws IOException {
    return new File(getRemoteEditsFile(conf).getParentFile(),
        FAILOVER_SNAPSHOT_FILE);
  }

  private void writeFailoverTestData(String fsck) throws IOException {
    if (!enableTestFramework) {
      LOG.info("Failover: Test framework - disabled");
      return;
    }
    float samplePercent = confg.getFloat(
        "dfs.avatarnode.failover.sample.percent", 0.05f);
    LOG.info("Failover: Test framework - using " + (100.0 * samplePercent)
        + " % sample size");
    List<FileStatusExtended> stat = super.getRandomFilesSample(samplePercent);
    AvatarFailoverSnapshot snapshot = new AvatarFailoverSnapshot(
        super.namesystem.getOpenFiles(), stat);
    File snapshotFile = new File(
        getSharedEditsFile(confg).getParentFile(), FAILOVER_SNAPSHOT_FILE);
    DataOutputStream out = new DataOutputStream(
        new BufferedOutputStream(new FileOutputStream(snapshotFile)));
    try {
      snapshot.write(out);
      out.writeBoolean(enableTestFrameworkFsck);
      if (enableTestFrameworkFsck) {
        Text.writeString(out, fsck);
      }
    } finally {
      out.close();
    }
    LOG.info("Failover: Test framework - saved snapshot file : " + snapshotFile);
  }

  private void verifySnapshotSampledFile(FileStatusExtended file)
      throws IOException {
    FileStatusExtended stat = super.namesystem.getFileInfoExtended(file
        .getPath().toString());
    if (!stat.equals(file)) {
      throw new IOException("Information for file : " + file.getPath()
          + " does not match with information on snapshot file, expected : "
          + file + ", actual : " + stat);
    }
  }

  private void verifyOpenFiles(OpenFilesInfo openFilesInfo) throws IOException {
    if (openFilesInfo.getGenStamp() != super.namesystem.getGenerationStamp()) {
      throw new IOException(
          "GS on snapshot file doesn't match with GS on node : "
              + openFilesInfo.getGenStamp() + ", "
              + super.namesystem.getGenerationStamp());
    }
    for (FileStatusExtended stat : openFilesInfo.getOpenFiles()) {
      verifySnapshotSampledFile(stat);
    }
  }

  private String verifyFailoverTestData() throws IOException {
    if (!enableTestFramework) {
      LOG.info("Failover: Test framework - disabled");
      return "";
    }
    String fsck = "";
    LOG.info("Failover: Test framework - verification - starting...");
    AvatarFailoverSnapshot snapshot = new AvatarFailoverSnapshot();
    File snapshotFile = getSnapshotFile(confg);
    DataInputStream in = new DataInputStream(
        new BufferedInputStream(new FileInputStream(snapshotFile)));
    try {
      snapshot.readFields(in);
      if (in.readBoolean()) {
        LOG.info("Failover: Test framework - found fsck data");
        fsck = Text.readString(in);
      }
    } finally {
      in.close();
    }
   
    LOG.info("Failover: Test framework - verifying open files: found "
        + snapshot.getOpenFilesInfo().getOpenFiles().size()
        + " files in the test snapshot")
    verifyOpenFiles(snapshot.getOpenFilesInfo());
   
    LOG.info("Failover: Test framework - verifying closed files: found "
        + snapshot.getSampledFiles().size()
        + " files in the test snapshot");   
    for (FileStatusExtended stat : snapshot.getSampledFiles()) {
      verifySnapshotSampledFile(stat);
    }

    LOG.info("Failover: Test framework - verification - succeeded");
    return fsck;
  }
 
  protected String runFailoverFsck() throws IOException {
    Map<String, String[]> pmap = new HashMap<String, String[]>();
    pmap.put("path", new String[] {"/"});
   
    // run fsck
    StringWriter stringWriter = new StringWriter();
    NamenodeFsck fscker = new NamenodeFsck(confg, this,
        pmap, new PrintWriter(stringWriter));
    fscker.fsck();
    return stringWriter.toString();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public synchronized void setAvatar(Avatar avatar) throws IOException {
    setAvatar(avatar, false);
  }

  /**
   * {@inheritDoc}
   */
  public synchronized void setAvatar(Avatar avatar, boolean force)
      throws IOException {
    if (avatar == currentAvatar) {
      LOG.info("Failover: Trying to change avatar to " + avatar +
               " but am already in that state.");
      return;
    }
    if (avatar == Avatar.STANDBY) {   // ACTIVE to STANDBY
      String msg = "Failover: Changing state from active to standby is not allowed." +
                   "If you really want to pause your primary, put it in safemode.";
      LOG.warn(msg);
      throw new IOException(msg);
    } else {                  // STANDBY to ACTIVE
      // Check to see if the primary is somehow checkpointing itself. If so, then
      // refuse to switch to active mode. This check is not foolproof but is a
      // defensive mechanism to prevent administrator errors.
      try {
        if (!zkIsEmpty()) {
          throw new IOException("Can't switch the AvatarNode to primary since " +
              "zookeeper record is not clean. Either use shutdownAvatar to kill " +
              "the current primary and clean the ZooKeeper entry, " +
              "or clear out the ZooKeeper entry if the primary is dead");
        }
      } catch (Exception ex) {
        throw new IOException("Cancelling setAvatar because of Exception", ex);
      }
      if (standby.hasStaleCheckpoint()) {
        String msg = "Failover: Failed to change avatar from " + currentAvatar +
                     " to " + avatar +
                     " because the Standby has not yet consumed all transactions.";
        LOG.warn(msg);
        throw new IOException(msg);
      }

      InjectionHandler
          .processEvent(InjectionEvent.AVATARNODE_AFTER_STALE_CHECKPOINT_CHECK);

      ZookeeperTxId zkTxId = null;
      if (!force) {
        zkTxId = getLastTransactionId();
        standby.quiesce(zkTxId.getTransactionId());
      } else {
        standby.quiesce(TXID_IGNORE);
      }
      cleaner.stop();
      cleanerThread.interrupt();
      try {
        cleanerThread.join();
      } catch (InterruptedException iex) {
        Thread.currentThread().interrupt();
      }

      String oldPrimaryFsck = null;
      if (!force) {
        verifyTransactionIds(zkTxId);
        oldPrimaryFsck = verifyFailoverTestData();
      }

      // change the value to the one for the primary
      int maxStandbyBufferedTransactions = confg.getInt(
          "dfs.max.buffered.transactions",
          HdfsConstants.DEFAULT_MAX_BUFFERED_TRANSACTIONS);
      FSEditLog.setMaxBufferedTransactions(maxStandbyBufferedTransactions);

      // Clear up deletion queue.
      clearInvalidates();

      standbySafeMode.triggerFailover();

      this.registerAsPrimaryToZK();

      sessionId = writeSessionIdToZK(this.startupConf);
      LOG.info("Failover: Changed avatar from " + currentAvatar + " to " + avatar);
      if (enableTestFramework && enableTestFrameworkFsck && !force) {
        if (!failoverFsck.equals(oldPrimaryFsck)) {
          LOG.warn("Failover: FSCK on old primary and new primary do not match");
          LOG.info("----- FSCK ----- OLD BEGIN");
          LOG.info("Failover: Old primary fsck: \n " + oldPrimaryFsck + "\n");
          LOG.info("----- FSCK ----- NEW BEGIN");
          LOG.info("Failover: New primary fsck: \n " + failoverFsck + "\n");
          LOG.info("----- FSCK ----- END");
        } else {
          LOG.info("Failover: Verified fsck.");
        }
      }
     
      currentAvatar = avatar;
      // Setting safe mode to null here so that we don't throw NPE in
      // getNameNodeSpecificKeys().
      standbySafeMode = null;
      confg.setClass("dfs.safemode.impl", NameNodeSafeModeInfo.class,
          SafeModeInfo.class);
    }
  }

  /*
   * While the AvatarNode is running in Standby mode it fills up the
   * invalidates queues for each datanode with blocks it
   * assumes have to be deleted. This information is not
   * entirely accurate; it fills up memory and can also lead
   * to data loss, since those queues are flushed to the datanodes
   * on failover and valid blocks may be deleted.
   *
   * To help prevent filling up the memory we clear these queues
   * periodically, and we do a final cleanup just before switching
   * to primary.
   */
  private class InvalidatesCleaner implements Runnable {

    volatile boolean running = true;

    @Override
    public void run() {
      while (running) {
        clearInvalidates();
        try {
          Thread.sleep(INVALIDATES_CLEANUP_INTERVAL);
        } catch (InterruptedException iex) {
          if (!running) {
            return;
          }
          Thread.currentThread().interrupt();
        }
      }
    }

    public void stop() {
      running = false;
    }
   
  }
 
  private void clearInvalidates() {
    try {
      DatanodeInfo[] nodes = super.getDatanodeReport(DatanodeReportType.ALL);
      assert namesystem.isInSafeMode();
      super.namesystem.writeLock();
      try {
        for (DatanodeInfo node : nodes) {
          super.namesystem.removeFromInvalidates(node.getStorageID());
        }
      } finally {
        super.namesystem.writeUnlock();
      }
    } catch (IOException e) {
      LOG.error("clearInvalidates() failed", e);
    }

  }

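  /**
   * Returns true if datanode requests should be deferred, i.e. this node is in
   * the Standby avatar and its ingest has fallen behind consuming the edit log
   * (or a test injection handler simulates that condition).
   */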
  private boolean ignoreDatanodes() {
    return currentAvatar == Avatar.STANDBY &&
            (standby == null
            || standby.fellBehind()
            || InjectionHandler
              .falseCondition(InjectionEvent.STANDBY_FELL_BEHIND));
  }

  @Override
  public void primaryCleared(DatanodeRegistration registration) {
    LOG.info("Received primaryCleared() from : " + registration);
    if (standbySafeMode != null) {
      standbySafeMode.reportPrimaryCleared(registration);
    }
  }

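  /**
   * Handles a datanode heartbeat. In addition to the commands produced by the
   * underlying NameNode, this may append an avatar-specific command:
   * CLEARPRIMARY while the standby safemode is waiting for datanodes to clear
   * the old primary, or BACKOFF while the standby ingest has fallen behind.
   */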
  public DatanodeCommand[] sendHeartbeatNew(DatanodeRegistration registration,
                                       long capacity,
                                       long dfsUsed, long remaining,
                                       long namespaceUsed,
                                       int xmitsInProgress,
                                       int xceiverCount) throws IOException {
    DatanodeCommand[] cmds = super.sendHeartbeat(
            registration, capacity, dfsUsed, remaining, namespaceUsed,
            xmitsInProgress, xceiverCount);

    if (standbySafeMode != null
        && standbySafeMode.reportHeartBeat(registration)) {
      LOG.info("Sending Clear Primary command to : " + registration);
      DatanodeCommand fDelCmd = AvatarDatanodeCommand.CLEARPRIMARY;
      if (cmds == null) {
        return new DatanodeCommand[] { fDelCmd };
      } else {
        DatanodeCommand[] newCmds = Arrays.copyOf(cmds, cmds.length + 1);
        newCmds[cmds.length] = fDelCmd;
        return newCmds;
      }
    } else if (ignoreDatanodes()) {
      if (cmds == null) {
        return new DatanodeCommand[]{AvatarDatanodeCommand.BACKOFF};
      } else {
        DatanodeCommand[] newCmds = Arrays.copyOf(cmds, cmds.length+1);
        newCmds[cmds.length] = AvatarDatanodeCommand.BACKOFF;
        return newCmds;
      }
    } else {
      return cmds;
    }
  }
 
  @Override
  /**
   * Determines whether or not the datanode should retry blocks if they are
   * not present in the blocks map.
   */
  public boolean shouldRetryAbsentBlocks() {
    return (currentAvatar == Avatar.STANDBY);
  }

  @Override
  /**
   * Determines whether or not the given block should be retried by the datanode
   * if it is not present in the blocksMap.
   */
  public boolean shouldRetryAbsentBlock(Block block) {
    // If this block does not belong to any file and its GS
    // is no less than the avatar node's GS,
    // the AvatarNode may not have consumed the file/block creation edit log yet,
    // so add it to the retry list.
    return (currentAvatar == Avatar.STANDBY &&
        (!namesystem.getPersistBlocks() ||
         block.getGenerationStamp() >= namesystem.getGenerationStamp()));
  }

  public DatanodeCommand blockReportNew(DatanodeRegistration nodeReg, BlockReport rep) throws IOException {
    if (runInfo.shutdown || !runInfo.isRunning) {
      return null;
    }
    if (ignoreDatanodes()) {
      LOG.info("Standby fell behind. Telling " + nodeReg.toString() +
                " to back off");
      // Do not process block reports yet as the ingest thread is catching up
      return AvatarDatanodeCommand.BACKOFF;
    }

    if (currentAvatar == Avatar.STANDBY) {
      Collection<Block> failed = super.blockReportWithRetries(nodeReg, rep);

      BlockCommand bCmd = new BlockCommand(DatanodeProtocols.DNA_RETRY,
          failed.toArray(new Block[failed.size()]));
      return bCmd;
    } else {
      return super.blockReport(nodeReg, rep);
    }
  }

  /**
   * {@inheritDoc}
   */
  public Block[] blockReceivedAndDeletedNew(DatanodeRegistration nodeReg,
                              Block blocksReceivedAndDeleted[]) throws IOException {
    if (runInfo.shutdown || !runInfo.isRunning) {
      // Do not attempt to process blocks when
      // the namenode is not running
      return new ReceivedBlockInfo[0];
    }
    if (ignoreDatanodes()) {
      LOG.info("Standby fell behind. Telling " + nodeReg.toString() +
      " to retry incremental block report of " + blocksReceivedAndDeleted.length
      + " blocks later.");
      return blocksReceivedAndDeleted;
    }
    List<Block> failed = new ArrayList<Block>();
    HashSet<Long> failedIds;
    if (currentAvatar == Avatar.STANDBY) {
      failedIds = new HashSet<Long>();
      namesystem.writeLock();
      try {
        for (int index = 0; index < blocksReceivedAndDeleted.length; index++) {
          Block blockRD = blocksReceivedAndDeleted[index];
          if(failedIds.contains(blockRD.getBlockId())){
            // check if there was no other blocking failed request
            blocksReceivedAndDeleted[index] = null;
            failed.add(blockRD);
            continue;
          }
          BlockInfo storedBlock = namesystem.blocksMap.getStoredBlock(blockRD);
          if (!DFSUtil.isDeleted(blockRD) && (storedBlock == null) &&
              (!namesystem.getPersistBlocks() ||
              blockRD.getGenerationStamp() >= namesystem.getGenerationStamp())) {
            // If this block does not belong to any file and its GS
            // is no less than the avatar node's GS,
            // the AvatarNode may not have consumed the file/block creation edit log yet,
            // so add it to the failed list.
            // - do not process any requests for blocks with the same block id
            //   (also add them to the failed list)
            // - do not block other requests
            blocksReceivedAndDeleted[index] = null;
            failed.add(blockRD);
            failedIds.add(blockRD.getBlockId());
          }
        }
      } finally {
        namesystem.writeUnlock();
        if (!failed.isEmpty()) {
          LOG.info("*BLOCK* NameNode.blockReceivedAndDeleted: "
            + "from " + nodeReg.getName() + " has to retry "
            + failed.size() + " blocks.");
        }
        for (Block blockRD : failed) {
          LOG.info("blockReceivedDeleted " + (DFSUtil.isDeleted(blockRD) ? "DELETED" : "RECEIVED")
              + " request received for "
              + blockRD + " on " + nodeReg.getName() + " size "
              + blockRD.getNumBytes()
              + " But it does not belong to any file." + " Retry later.");
        }
      }
    }
    super.blockReceivedAndDeleted(nodeReg, blocksReceivedAndDeleted);
    return failed.toArray(new Block[failed.size()]);
  }

  /**
   * {@inheritDoc}
   */
  public long[] blockReceivedAndDeletedNew(DatanodeRegistration nodeReg,
        IncrementalBlockReport receivedAndDeletedBlocks) throws IOException {
    long[] failedMap = null;
    if (runInfo.shutdown || !runInfo.isRunning) {
      // Do not attempt to process blocks when
      // the namenode is not running
      if (currentAvatar == Avatar.STANDBY) {
        return new long[0];
      } else {
        return null;
      }
    }
    HashSet<Long> failedIds;
    if (currentAvatar == Avatar.STANDBY) {
      int noAck = receivedAndDeletedBlocks.getLength();
     
      // retry all blocks if the standby is behind in consuming edits
      if (ignoreDatanodes()) {
        LOG.info("Standby fell behind. Telling " + nodeReg.toString() +
        " to retry incremental block report of " + noAck
        + " blocks later.");
        failedMap = LightWeightBitSet.getBitSet(noAck);
        for (int i = 0; i < noAck; i++)
          LightWeightBitSet.set(failedMap, i);
        return failedMap;
      }
     
      Block blockRD = new Block();
      failedIds = new HashSet<Long>();
      failedMap = LightWeightBitSet.getBitSet(noAck);
      namesystem.writeLock();
      try {
        receivedAndDeletedBlocks.resetIterator();
        for (int currentBlock = 0; currentBlock < noAck; currentBlock++) {
          receivedAndDeletedBlocks.getNext(blockRD);
          if(failedIds.contains(blockRD.getBlockId())){
            // check if there was no other blocking failed request
            blockRD.setNumBytes(BlockFlags.IGNORE);
            receivedAndDeletedBlocks.setBlock(blockRD, currentBlock);
            LightWeightBitSet.set(failedMap, currentBlock);
            continue;
          }
          BlockInfo storedBlock = namesystem.blocksMap.getStoredBlock(blockRD);
          if (!DFSUtil.isDeleted(blockRD) && (storedBlock == null) &&
              (!namesystem.getPersistBlocks() ||
              blockRD.getGenerationStamp() >= namesystem.getGenerationStamp())) {
            // If this block does not belong to any file and its GS
            // is no less than the avatar node's GS,
            // the AvatarNode may not have consumed the file/block creation edit log yet,
            // so add it to the failed list.
            // - do not process any requests for blocks with the same block id
            //   (also add them to the failed list)
            // - do not block other requests
            blockRD.setNumBytes(BlockFlags.IGNORE);
            receivedAndDeletedBlocks.setBlock(blockRD, currentBlock);
            LightWeightBitSet.set(failedMap, currentBlock);
            failedIds.add(blockRD.getBlockId());
          }
        }
      } finally {
        namesystem.writeUnlock();
        if (failedMap != null && LightWeightBitSet.cardinality(failedMap) != 0) {
          LOG.info("*BLOCK* NameNode.blockReceivedAndDeleted: "
            + "from " + nodeReg.getName() + " has to retry "
            + LightWeightBitSet.cardinality(failedMap) + " blocks.");
        }
        receivedAndDeletedBlocks.resetIterator();
        for (int currentBlock = 0; currentBlock < noAck; currentBlock++) {
          receivedAndDeletedBlocks.getNext(blockRD);
          if (!LightWeightBitSet.get(failedMap, currentBlock))
            continue;
          LOG.info("blockReceivedDeleted " + (DFSUtil.isDeleted(blockRD) ? "DELETED" : "RECEIVED")
              + " request received for "
              + blockRD + " on " + nodeReg.getName() + " size "
              + blockRD.getNumBytes()
              + " But it does not belong to any file." + " Retry later.");
        }
      }
    }
    super.blockReceivedAndDeleted(nodeReg, receivedAndDeletedBlocks);
    return failedMap;
  }

  /**
   * Returns the hostname:port for the AvatarNode. The default
   * port for the AvatarNode is one more than the port of the
   * underlying namenode.
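   *
   * For example (illustrative values): if the underlying namenode listens on
   * namenode.example.com:9000 and dfs.avatarnode.port is not set, the
   * AvatarNode RPC address resolves to namenode.example.com:9001.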
   */
  public static InetSocketAddress getAddress(Configuration conf) {
    InetSocketAddress u = NameNode.getAddress(conf);
    int port = conf.getInt(AvatarNode.DFS_AVATARNODE_PORT_KEY, u.getPort() + 1);
    return new InetSocketAddress(u.getHostName(), port);
  }

  /**
   * Help message for a user.
   */
  private static void printUsage() {
    System.err.println(
      "Usage: java AvatarNode [" +
      StartupOption.STANDBY.getName() + "] | [" +
      StartupOption.NODEZERO.getName() + "] | [" +
      StartupOption.NODEONE.getName() + "] | [" +
      StartupOption.FORMAT.getName() + "] | [" +
      StartupOption.UPGRADE.getName() + "] | [" +
      StartupOption.ROLLBACK.getName() + "] | [" +
      StartupOption.FINALIZE.getName() + "] | [" +
      StartupOption.IMPORT.getName() + "]");
  }

  /**
   * validates command line arguments
   */
  static void validateStartupOptions(StartupInfo startInfo) throws IOException {
    // standby cannot be specified along with format, finalize, rollback, or upgrade
    if (startInfo.isStandby) {
      if (startInfo.startOpt == StartupOption.FORMAT ||
          startInfo.startOpt == StartupOption.FINALIZE ||
          startInfo.startOpt == StartupOption.ROLLBACK ||
          startInfo.startOpt == StartupOption.UPGRADE) {
        throw new IOException("Standby avatar node cannot be started with " +
          startInfo.startOpt + " option.");
      }
    }
  }

  private static class StartupInfo {
    StartupOption startOpt;
    InstanceId instance;
    boolean isStandby;
    String serviceName;
   
    public StartupInfo(StartupOption startOpt, InstanceId instance,
                       boolean isStandby, String serviceName) {
      this.startOpt = startOpt;
      this.instance = instance;
      this.isStandby = isStandby;
      this.serviceName = serviceName;
    }
  }

  /**
   * Analyze the command line options
   */
  private static StartupInfo parseArguments(String args[]) {
    InstanceId instance = InstanceId.NODEZERO;
    StartupOption startOpt = StartupOption.REGULAR;
    boolean isStandby= false;
    String serviceName = null;
    int argsLen = (args == null) ? 0 : args.length;
    for (int i=0; i < argsLen; i++) {
      String cmd = args[i];
      if (StartupOption.SERVICE.getName().equalsIgnoreCase(cmd)) {
        if (++i < argsLen) {
          serviceName = args[i];
        } else {
          return null;
        }
      } else if (StartupOption.STANDBY.getName().equalsIgnoreCase(cmd)) {
        isStandby = true;
      } else if (StartupOption.NODEZERO.getName().equalsIgnoreCase(cmd)) {
        instance = InstanceId.NODEZERO;
      } else if (StartupOption.NODEONE.getName().equalsIgnoreCase(cmd)) {
        instance = InstanceId.NODEONE;
      } else if (StartupOption.FORMAT.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.FORMAT;
      } else if (StartupOption.FORMATFORCE.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.FORMATFORCE;
      } else if (StartupOption.REGULAR.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.REGULAR;
      } else if (StartupOption.UPGRADE.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.UPGRADE;
      } else if (StartupOption.ROLLBACK.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.ROLLBACK;
      } else if (StartupOption.FINALIZE.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.FINALIZE;
      } else if (StartupOption.IMPORT.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.IMPORT;
      } else {
        return null;
      }
    }
    return new StartupInfo(startOpt, instance, isStandby, serviceName);
  }

  /**
   * Records the startup command in the configuration
   */
  private static void setStartupOption(Configuration conf, StartupOption opt) {
    conf.set("dfs.namenode.startup", opt.toString());
  }

  public static AvatarNode createAvatarNode(String argv[],
                                    Configuration conf) throws IOException {
    return createAvatarNode(argv, conf, new RunInfo());
  }

  /**
   * HDFS federation configuration that is specific to a name service.
   * These keys are suffixed with the nameserviceId in the configuration. For example,
   * "dfs.namenode.rpc-address.nameservice1".
   *
   * The following are nameservice-specific keys.
   */
  final private static String DFS_AVATARNODE_PORT_KEY = "dfs.avatarnode.port";
  final private static String DFS_SHARED_NAME_DIR0_KEY = "dfs.name.dir.shared0";
  final private static String DFS_SHARED_NAME_DIR1_KEY = "dfs.name.dir.shared1";
  final private static String DFS_SHARED_EDITS_DIR0_KEY =
    "dfs.name.edits.dir.shared0";
  final private static String DFS_SHARED_EDITS_DIR1_KEY =
    "dfs.name.edits.dir.shared1";
  final private static String ZERO = "0";
  final private static String ONE = "1";
  final public static String DFS_NAMENODE_RPC_ADDRESS0_KEY =
    DFS_NAMENODE_RPC_ADDRESS_KEY+ZERO;
  final public static String DFS_NAMENODE_RPC_ADDRESS1_KEY =
    DFS_NAMENODE_RPC_ADDRESS_KEY+ONE;

  public static final String[] AVATARSERVICE_SPECIFIC_KEYS = {                                   
    DFS_AVATARNODE_PORT_KEY,
    DFS_NAMENODE_RPC_ADDRESS0_KEY,
    DFS_NAMENODE_RPC_ADDRESS1_KEY,
    DATANODE_PROTOCOL_ADDRESS+ZERO,
    DATANODE_PROTOCOL_ADDRESS+ONE,
    DFS_NAMENODE_HTTP_ADDRESS_KEY+ZERO,
    DFS_NAMENODE_HTTP_ADDRESS_KEY+ONE,
  };

  /** 
   * In a federated setup, configuration is provided for a set of
   * avatarnodes, namenodes etc., which are
   * grouped under a logical nameservice ID. The configuration keys specific
   * to them have a suffix set to the configured nameserviceId.
   *
   * This method copies the value from specific key of format key.nameserviceId
   * to key, to set up the generic configuration. Once this is done, only
   * generic version of the configuration is read in rest of the code, for
   * backward compatibility and simpler code changes.
   *
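   * For example, with a hypothetical nameservice id "ns1", the value of
   * "dfs.avatarnode.port.ns1" is copied to "dfs.avatarnode.port".
   *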
   * @param conf
   *          Configuration object to lookup specific key and to set the value
   *          to the key passed. Note the conf object is modified
   * @see DFSUtil#setGenericConf(Configuration, String, String...)
   */
  public static void initializeGenericKeys(Configuration conf, String serviceKey) {                               
    if ((serviceKey == null) || serviceKey.isEmpty()) {
      return;
    }   
    NameNode.initializeGenericKeys(conf, serviceKey);
   
    // adjust meta directory names for this service
    adjustMetaDirectoryNames(conf, serviceKey);
   
    DFSUtil.setGenericConf(conf, serviceKey, AVATARSERVICE_SPECIFIC_KEYS);   
  }
 
  /** Append service name to each avatar meta directory name
   *
   * @param conf configuration of NameNode
   * @param serviceKey the non-empty name of the name node service
   */
  protected static void adjustMetaDirectoryNames(Configuration conf, String serviceKey) {
    adjustMetaDirectoryName(conf, DFS_SHARED_NAME_DIR0_KEY, serviceKey);
    adjustMetaDirectoryName(conf, DFS_SHARED_NAME_DIR1_KEY, serviceKey);
    adjustMetaDirectoryName(conf, DFS_SHARED_EDITS_DIR0_KEY, serviceKey);
    adjustMetaDirectoryName(conf, DFS_SHARED_EDITS_DIR1_KEY, serviceKey);
  }

  /**
   * Tries to bind to the address specified in ZooKeeper; this will fail
   * if the primary is alive, either on the same machine or on a remote machine.
   */
  private static void isPrimaryAlive(String zkRegistry) throws Exception {
    String parts[] = zkRegistry.split(":");
    if (parts.length != 2) {
      throw new IllegalArgumentException("Invalid Address : " + zkRegistry);
    }
    String host = parts[0];
    int port = Integer.parseInt(parts[1]);
    InetSocketAddress clientSocket = new InetSocketAddress(host, port);
    ServerSocket socket = new ServerSocket();
    socket.bind(clientSocket);
    socket.close();
  }

  public static AvatarNode createAvatarNode(String argv[],
                                            Configuration conf,
                                            RunInfo runInfo) throws IOException {
    if (conf == null) {
      conf = new Configuration();
    }
    Configuration startupConf = conf;   // save configuration at startup
    StartupInfo startInfo = parseArguments(argv);
    if (startInfo == null || startInfo.startOpt == null) {
      printUsage();
      return null;
    }
    StartupOption startOpt = startInfo.startOpt;
    if (!validateServiceName(conf, startInfo.serviceName)) {
      return null;
    }
   
    initializeGenericKeys(conf, startInfo.serviceName);
    setStartupOption(conf, startOpt);
    conf = updateAddressConf(conf, startInfo.instance);
    NameNode.setupDefaultURI(conf);

    // standby cannot be specified along with format, finalize, rollback, or upgrade
    validateStartupOptions(startInfo);
   
    // We need to check the zookeeper so that the node starting as active
    // is the one registered with the zookeeper
    // and if the node is starting as standby there has to be a master
    // already so that the node doesn't move the log and the image
    InetSocketAddress defaultAddr = NameNode.getClientProtocolAddress(startupConf);
    String fsname = defaultAddr.getHostName() + ":" + defaultAddr.getPort();
    InetSocketAddress actualAddr = NameNode.getClientProtocolAddress(conf);
    String actualName = actualAddr.getHostName() + ":" + actualAddr.getPort();


    AvatarZooKeeperClient zk = new AvatarZooKeeperClient(conf, null);
    boolean zkRegistryMatch = true;
    boolean primaryPresent = false;
    String errorMsg = null;
    try {
      Stat stat = new Stat();
      String zkRegistry = zk.getPrimaryAvatarAddress(fsname, stat, false);
      if (zkRegistry == null) {
        // The registry is empty. Usually this means failover is in progress
        // we need to manually fix it before starting primary
        errorMsg = "A zNode that indicates the primary is empty. "
            + "AvatarNode can only be started as primary if it "
            + "is registered as primary with ZooKeeper";
        zkRegistryMatch = false;
      } else {
        primaryPresent = true;
        if (!zkRegistry.equalsIgnoreCase(actualName)) {
          zkRegistryMatch = false;
          errorMsg = "Registration information in ZooKeeper doesn't "
              + "match the address of this node. AvatarNode can "
              + "only be started as primary if it is registered as "
              + "primary with ZooKeeper. zkRegistry = " + zkRegistry
              + ", actual name = " + actualName;
        }
      }
      if (!startInfo.isStandby) {
        isPrimaryAlive(zkRegistry);
      }
    } catch (Exception e) {
      LOG.error("Got Exception reading primary node registration "
          + "from ZooKeeper. Aborting the start", e);
      zkRegistryMatch = false;

    } finally {
      try {
        zk.shutdown();
      } catch (InterruptedException e) {
        LOG.error("Error shutting down ZooKeeper client", e);
      }
    }
    if (!zkRegistryMatch && !startInfo.isStandby) {
      LOG.error(errorMsg);
      throw new IOException("Cannot start this AvatarNode as Primary.");
    }
    if (!primaryPresent && startInfo.isStandby) {
      throw new IOException("Cannot start Standby since the " +
          "primary is unknown");
    }

    long ssid = 0;
    // We are the primary avatar, write session Id to ZK.
    if (zkRegistryMatch && !startInfo.isStandby) {
      ssid = writeSessionIdToZK(startupConf);
    }

    // If sync is requested, then we copy only the fsimage
    //  (and not the transaction logs) from the other node.
    // If we are NODEONE, then modify the configuration to
    // set dfs.name.dir, fs.default.name and dfs.http.address.
    //
    conf = copyFsImage(conf, startInfo);

    // namenode options.
    switch (startOpt) {
      case FORMAT:
        boolean aborted = format(conf, true);
        System.exit(aborted ? 1 : 0);
      case FORMATFORCE:
        aborted = format(conf, false);
        return null;
      case FINALIZE:
        aborted = finalize(conf, true);
        System.exit(aborted ? 1 : 0);
      default:
    }

   
    // We need to put the Namenode into safemode as soon as it starts up.
    // There is a race condition, where before the Standby AvatarNode can put
    // the NameNode into safemode, the NameNode might leave safemode. This could
    // occur in the case of a start where the FSImage and FSEdits are empty
    // and hence the NameNode doesn't wait at all in safemode.
    if (startInfo.isStandby) {
      conf.setClass("dfs.safemode.impl", StandbySafeMode.class,
          SafeModeInfo.class);
    }
    // set persisting blocks to be true
    conf.setBoolean("dfs.persist.blocks", true);
   
    return new AvatarNode(startupConf, conf,
                          startInfo, runInfo, ssid);
  }
 
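  /**
   * Returns true if ZooKeeper has no primary avatar address registered
   * for the filesystem named in the startup configuration.
   */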
  private boolean zkIsEmpty() throws Exception {
      InetSocketAddress defaultAddr = NameNode.getClientProtocolAddress(startupConf);
      String fsname = defaultAddr.getHostName() + ":" + defaultAddr.getPort();

      AvatarZooKeeperClient zk =
        new AvatarZooKeeperClient(this.confg, null);
      try {
        Stat stat = new Stat();
        String zkRegistry = zk.getPrimaryAvatarAddress(fsname, stat, false);
        return zkRegistry == null;
      } catch (Exception e) {
        LOG.error("Got Exception reading primary node registration " +
            "from ZooKeeper.", e);
        throw e;
      } finally {
        try {
          zk.shutdown();
        } catch (InterruptedException e) {
          LOG.error("Error shutting down ZooKeeper client", e);
        }
      }
  }
 
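  /**
   * Copies src into dest on the given (local) filesystem, retrying up to
   * three times. If dest already exists it is first moved aside to a
   * timestamped backup and older backups are pruned via cleanupBackup().
   */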
  static void copyFiles(FileSystem fs, File src,
      File dest, Configuration conf) throws IOException {
    int MAX_ATTEMPT = 3;
    for (int i = 0; i < MAX_ATTEMPT; i++) {
      try {
        String mdate = dateForm.format(new Date(now()));
        if (dest.exists()) {
          File tmp = new File (dest + File.pathSeparator + mdate);
          if (!dest.renameTo(tmp)) {
            throw new IOException("Unable to rename " + dest +
                                  " to " +  tmp);
          }
          cleanupBackup(conf, dest);
          LOG.info("Moved aside " + dest + " as " + tmp);
        }
        if (!FileUtil.copy(fs, new Path(src.toString()),
                          fs, new Path(dest.toString()),
                          false, conf)) {
          String msg = "Error copying " + src + " to " + dest;
          throw new IOException(msg);
        }
        LOG.info("Copied " + src + " into " + dest);
        return;
      } catch (IOException e) {
        if (i == MAX_ATTEMPT - 1) {
          LOG.error(e);
          throw e;
        }
        try {
          Thread.sleep(1000);
        } catch (InterruptedException iex) {
          throw new IOException(iex);
        }
      }
    }
  }

  /**
   * Copies the fsimage from the remote shared device if needed, and returns
   * the configuration that should be used by this instance of the AvatarNode.
   */
  static Configuration copyFsImage(Configuration conf, StartupInfo startInfo)
    throws IOException {
    String img0 = conf.get("dfs.name.dir.shared0");
    String img1 = conf.get("dfs.name.dir.shared1");
    String edit0 = conf.get("dfs.name.edits.dir.shared0");
    String edit1 = conf.get("dfs.name.edits.dir.shared1");
    Collection<String> namedirs = conf.getStringCollection("dfs.name.dir");
    Collection<String> editsdir = conf.getStringCollection("dfs.name.edits.dir");
    String msg = "";

    if (img0 == null || img0.isEmpty()) {
      msg += "No values specified in dfs.name.dir.shared0";
    }
    if (img1 == null || img1.isEmpty()) {
      msg += " No values specified in dfs.name.dir.shared1";
    }
    if (edit0 == null || edit0.isEmpty()) {
      msg += " No values specified in dfs.name.edits.dir.shared0";
    }
    if (edit1 == null || edit1.isEmpty()) {
      msg += " No values specified in dfs.name.edits.dir.shared1";
    }
    if (msg.length() != 0) {
      LOG.info(msg);
      throw new IOException(msg);
    }

    // verify that the shared directories are not specified as dfs.name.dir
    for (String str : namedirs) {
      if (str.equalsIgnoreCase(img0)) {
        msg = "The name specified in dfs.name.dir.shared0 " +
              img0 + " is already part of dfs.name.dir ";
      }
      if (str.equalsIgnoreCase(img1)) {
        msg += " The name specified in dfs.name.dir.shared1 " +
              img1 + " is already part of dfs.name.dir ";
      }
    }
    if (msg.length() != 0) {
      LOG.info(msg);
      throw new IOException(msg);
    }
    // verify that the shared edits directories are not specified as dfs.name.edits.dir
    for (String str : editsdir) {
      if (str.equalsIgnoreCase(edit0)) {
        msg = "The name specified in dfs.name.edits.dir.shared0 " +
              edit0 + " is already part of dfs.name.edits.dir ";
      }
      if (str.equalsIgnoreCase(edit1)) {
        msg += " The name specified in dfs.name.edits.dir.shared1 " +
              edit1 + " is already part of dfs.name.edits.dir ";
      }
    }
    if (msg.length() != 0) {
      LOG.info(msg);
      throw new IOException(msg);
    }

    File primary = new File(img0);
    File standby = new File(img1);
    FileSystem localFs = FileSystem.getLocal(conf).getRaw();
    File src = null;
    File dest = null;
    File srcedit = null;
    File destedit = null;

    //
    // if we are instance one then copy from primary to secondary
    // otherwise copy from secondary to primary.
    //
    if (startInfo.instance == InstanceId.NODEONE) {
      src = primary;
      dest = standby;
      srcedit = new File(edit0);
      destedit = new File(edit1);
    } else if (startInfo.instance == InstanceId.NODEZERO) {
      dest = primary;
      src = standby;
      destedit = new File(edit0);
      srcedit = new File(edit1);
    }

    // copy fsimage directory if needed
    if (src.exists() && startInfo.isStandby) {
      copyFiles(localFs, src, dest, conf);

      // Remove the lock file from the newly synced directory
      File lockfile = new File(dest, STORAGE_FILE_LOCK);
      lockfile.delete();

      // Remove fsimage.ckpt if it exists.
      File ckptfile = new File(dest.toString() + IMAGENEW);
      ckptfile.delete();

      // Now, copy from the now-updated shared directory to all other
      // local dirs specified in dfs.name.dir
      src = dest;
      if (!namedirs.isEmpty()) {
        for (String str : namedirs) {
          dest = new File(str);
          copyFiles(localFs, src, dest, conf);
        }
      }
    }

    // copy edits directory if needed
    if (srcedit.exists() && startInfo.isStandby) {
      copyFiles(localFs, srcedit, destedit, conf);

      // Remove the lock file from the newly synced directory
      File lockfile = new File(destedit, STORAGE_FILE_LOCK);
      if (lockfile.exists() && lockfile.delete() == false) {
        throw new IOException("Unable to delete lock file " + lockfile);
      }

      // Remove edits and edits.new. Create empty edits file.
      File efile = new File(destedit.toString() + EDITSFILE);
      if (efile.exists() && efile.delete() == false) {
        throw new IOException("Unable to delete edits file " + efile);
      }
      efile = new File(destedit + EDITSNEW);
      efile.delete();
      createEditsFile(destedit.toString());

      // Now, copy from the now-updated shared directory to all other
      // local dirs specified in dfs.name.edits.dir
      srcedit = destedit;
      if (!editsdir.isEmpty()) {
        for (String str : editsdir) {
          destedit = new File(str);
          copyFiles(localFs, srcedit, destedit, conf);
        }
      }
    }

    // allocate a new configuration and update dfs.name.dir appropriately.
    // The shared device should be the first in the list.
    Configuration newconf = new Configuration(conf);
    StringBuffer buf = new StringBuffer();
    if (startInfo.instance == InstanceId.NODEONE) {
      buf.append(img1);
    } else if (startInfo.instance == InstanceId.NODEZERO) {
      buf.append(img0);
    }
    for (String str : namedirs) {
      buf.append(",");
      buf.append(str);
    }
    newconf.set("dfs.name.dir", buf.toString());
    buf = null;

    // update dfs.name.edits.dir appropriately in the new configuration
    // The shared device should be the first in the list.
    StringBuffer buf1 = new StringBuffer();
    if (startInfo.instance == InstanceId.NODEONE) {
      buf1.append(edit1);
    } else if (startInfo.instance == InstanceId.NODEZERO) {
      buf1.append(edit0);
    }
    for (String str : editsdir) {
      buf1.append(",");
      buf1.append(str);
    }
    newconf.set("dfs.name.edits.dir", buf1.toString());

    return newconf;
  }
 
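  /**
   * Prunes timestamped backup copies of the given directory. A backup is
   * deleted only when at least standby.image.copies.tokeep newer copies are
   * kept and the backup is older than standby.image.days.tokeep days; if
   * both settings are 0, nothing is deleted.
   */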
  static void cleanupBackup(Configuration conf, File origin) {
    File root = origin.getParentFile();
    final String originName = origin.getName();
    String[] backups = root.list(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        if (!name.startsWith(originName) || name.equals(originName))
          return false;
        try {
          dateForm.parse(name.substring(name.indexOf(File.pathSeparator) + 1));
        } catch (ParseException pex) {
          return false;
        }
        return true;
      }
    });

    Arrays.sort(backups, new Comparator<String>() {

      @Override
      public int compare(String back1, String back2) {
        try {
          Date date1 = dateForm.parse(back1.substring(back1
              .indexOf(File.pathSeparator) + 1));
          Date date2 = dateForm.parse(back2.substring(back2
              .indexOf(File.pathSeparator) + 1));
          // Sorting in reverse order, from later dates to earlier
          return date2.compareTo(date1);
        } catch (ParseException pex) {
          return 0;
        }
      }
    });
   
    int copiesToKeep = conf.getInt("standby.image.copies.tokeep", 0);
    int daysToKeep = conf.getInt("standby.image.days.tokeep", 0);
    if (copiesToKeep == 0 && daysToKeep == 0) {
      // Do not delete anything in this case
      return;
    }
    Date now = new Date(now());
    int copies = 0;
    for (String backup : backups) {
      copies++;
      Date backupDate = null;
      try {
        backupDate = dateForm.parse(backup.substring(backup
            .indexOf(File.pathSeparator) + 1));
      } catch (ParseException pex) {
        // This should not happen because of the
        // way we construct the list
      }
      if (backupDate == null) {
        continue;
      }
      long backupAge = now.getTime() - backupDate.getTime();
      // Use a long multiplier to avoid int overflow for large daysToKeep.
      if (copies > copiesToKeep && backupAge > daysToKeep * 24L * 60 * 60 * 1000) {
        // This backup is both old and we have enough of newer backups stored -
        // delete
        try {
          FileUtil.fullyDelete(new File(root, backup));
          LOG.info("Deleted backup " + new File(root, backup));
        } catch (IOException iex) {
          LOG.error("Error deleting backup " + new File(root, backup), iex);
        }
      }
    }

  }

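  /**
   * Returns a copy of the configuration with the generic address keys
   * (dfs.http.address, dfs.namenode.dn-address, the namenode RPC address and
   * fs.default.name) rewritten to the instance-specific "...0" or "...1"
   * values for the given instance.
   */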
  public static Configuration updateAddressConf(Configuration conf, InstanceId instance) {
    Configuration newconf = new Configuration(conf);
    // if we are starting as the other namenode, then change the
    // default URL to make the namenode attach to the appropriate URL
    if (instance == InstanceId.NODEZERO) {
      String fs = conf.get("dfs.http.address0");
      if (fs != null) {
        newconf.set("dfs.http.address", fs);
      }
      fs = conf.get("dfs.namenode.dn-address0");
      if (fs != null) {
        newconf.set("dfs.namenode.dn-address", fs);
      }
      fs = conf.get(AvatarNode.DFS_NAMENODE_RPC_ADDRESS0_KEY);
      if (fs != null) {
        newconf.set(AvatarNode.DFS_NAMENODE_RPC_ADDRESS_KEY, fs);
        newconf.set("fs.default.name0", fs);
        conf.set("fs.default.name0", fs);
      }
      fs = conf.get("fs.default.name0");
      if (fs != null) {
        newconf.set("fs.default.name", fs);
      }
    }
    if (instance == InstanceId.NODEONE) {
      String fs = conf.get("dfs.http.address1");
      if (fs != null) {
        newconf.set("dfs.http.address", fs);
      }
      fs = conf.get("dfs.namenode.dn-address1");
      if (fs != null) {
        newconf.set("dfs.namenode.dn-address", fs);
      }
      fs = conf.get(AvatarNode.DFS_NAMENODE_RPC_ADDRESS1_KEY);
      if (fs != null) {
        newconf.set(AvatarNode.DFS_NAMENODE_RPC_ADDRESS_KEY, fs);
        newconf.set("fs.default.name1", fs);
        conf.set("fs.default.name1", fs);
      }
      fs = conf.get("fs.default.name1");
      if (fs != null) {
        newconf.set("fs.default.name", fs);
      }
    }
    return newconf;
  }

  /**
   * Returns the address of the remote namenode
   */
  InetSocketAddress getRemoteNamenodeAddress(Configuration conf)
    throws IOException {
    String fs = null;
    if (instance == InstanceId.NODEZERO) {
      fs = conf.get(AvatarNode.DFS_NAMENODE_RPC_ADDRESS1_KEY);
      if (fs == null)
        fs = conf.get("fs.default.name1");
    } else if (instance == InstanceId.NODEONE) {
      fs = conf.get(AvatarNode.DFS_NAMENODE_RPC_ADDRESS0_KEY);
      if (fs == null)
        fs = conf.get("fs.default.name0");
    } else {
      throw new IOException("Unknown instance " + instance);
    }
    if (fs != null) {
      conf = new Configuration(conf);
      conf.set("fs.default.name", fs);
    }
    return NameNode.getAddress(conf);
  }

  /**
   * Returns the address of the http server of the remote namenode
   */
  String getRemoteNamenodeHttpName(Configuration conf)
    throws IOException {
    if (instance == InstanceId.NODEZERO) {
      return conf.get("dfs.http.address1");
    } else if (instance == InstanceId.NODEONE) {
      return conf.get("dfs.http.address0");
    } else {
      throw new IOException("Unknown instance " + instance);
    }
  }

  /**
   * Create an empty edits log
   */
  static void createEditsFile(String editDir) throws IOException {
    File editfile = new File(editDir + EDITSFILE);
    FileOutputStream fp = new FileOutputStream(editfile);
    DataOutputBuffer buf = new DataOutputBuffer(1024);
    buf.writeInt(FSConstants.LAYOUT_VERSION);
    buf.writeTo(fp);
    buf.close();
    fp.close();
  }

  /**
   * Return the edits file that is shared.
   */
  File getSharedEditsFile(Configuration conf) throws IOException {
    String edit = null;
    if (instance == InstanceId.NODEZERO) {
      edit = conf.get("dfs.name.edits.dir.shared0");
    } else if (instance == InstanceId.NODEONE) {
      edit = conf.get("dfs.name.edits.dir.shared1");
    } else {
      LOG.info("Instance is invalid. " + instance);
      throw new IOException("Instance is invalid. " + instance);
    }
    return new File(edit + EDITSFILE);
  }

  /**
   * Return the edits file of the remote NameNode
   */
  File getRemoteEditsFile(Configuration conf) throws IOException {
    String edit = null;
    if (instance == InstanceId.NODEZERO) {
      edit = conf.get("dfs.name.edits.dir.shared1");
    } else if (instance == InstanceId.NODEONE) {
      edit = conf.get("dfs.name.edits.dir.shared0");
    } else {
      LOG.info("Instance is invalid. " + instance);
      throw new IOException("Instance is invalid. " + instance);
    }
    return new File(edit + EDITSFILE);
  }

  /**
   * Return the image file of the remote NameNode
   */
  File getRemoteImageFile(Configuration conf) throws IOException {
    String image = null;
    if (instance == InstanceId.NODEZERO) {
      image = conf.get("dfs.name.dir.shared1");
    } else if (instance == InstanceId.NODEONE) {
      image = conf.get("dfs.name.dir.shared0");
    } else {
      LOG.info("Instance is invalid. " + instance);
      throw new IOException("Instance is invalid. " + instance);
    }
    return new File(image + IMAGEFILE);
  }

  /**
   * Returns the image file used by this avatar. Note that this might not
   * necessarily be a local image file, but it is the image file for this
   * Avatar. For example, if this is instance one, it could return the image
   * file under the NFS /one directory; that is fine, since that image
   * belongs to instance one.
   */
  File getAvatarImageFile(Configuration conf) throws IOException {
    File[] images = getFSImage().getImageFiles();
    if (images == null || images.length == 0) {
      throw new IOException("No image files found for this Avatar");
    }
    return images[0];
  }

  /**
   * Return the edits.new file of the remote NameNode
   */
  File getRemoteEditsFileNew(Configuration conf) throws IOException {
    String edit = null;
    if (instance == InstanceId.NODEZERO) {
      edit = conf.get("dfs.name.edits.dir.shared1");
    } else if (instance == InstanceId.NODEONE) {
      edit = conf.get("dfs.name.edits.dir.shared0");
    } else {
      LOG.info("Instance is invalid. " + instance);
      throw new IOException("Instance is invalid. " + instance);
    }
    return new File(edit + EDITSNEW);
  }
 
  /**
   * Return the fstime file of the remote NameNode
   */
  File getRemoteTimeFile(Configuration conf) throws IOException {
    String edit = null;
    if (instance == InstanceId.NODEZERO) {
      edit = conf.get("dfs.name.edits.dir.shared1");
    } else if (instance == InstanceId.NODEONE) {
      edit = conf.get("dfs.name.edits.dir.shared0");
    } else {
      LOG.info("Instance is invalid. " + instance);
      throw new IOException("Instance is invalid. " + instance);
    }
    return new File(edit + TIMEFILE);
  }

  /**
   * Reads the timestamp of the last checkpoint from the remote fstime file.
   */
  long readRemoteFstime(Configuration conf)
    throws IOException {
    String edit = null;
    if (instance == InstanceId.NODEZERO) {
      edit = conf.get("dfs.name.edits.dir.shared1");
    } else if (instance == InstanceId.NODEONE) {
      edit = conf.get("dfs.name.edits.dir.shared0");
    } else {
      LOG.info("Instance is invalid. " + instance);
      throw new IOException("Instance is invalid. " + instance);
    }
    return readFstime(edit);
  }

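  /**
   * Reads the checkpoint timestamp from the fstime file under the given
   * directory.
   */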
  long readFstime(String location) throws IOException {
    File timeFile = new File(location + TIMEFILE);
    long timeStamp = 0L;
    DataInputStream in = null;
    try {
      in = new DataInputStream(new FileInputStream(timeFile));
      timeStamp = in.readLong();
    } catch (IOException e) {
      if (!timeFile.exists()) {
        String msg = "Error reading checkpoint time file " + timeFile +
                     " file does not exist.";
        LOG.error(msg);
        throw new IOException(msg + e);
      } else if (!timeFile.canRead()) {
        String msg = "Error reading checkpoint time file " + timeFile +
                     " cannot read file of size " + timeFile.length() +
                     " last modified " +
                     dateForm.format(new Date(timeFile.lastModified()));
        LOG.error(msg);
        throw new IOException(msg + e);
      } else {
        String msg = "Error reading checkpoint time file " + timeFile;
        LOG.error(msg);
        throw new IOException(msg + e);
      }
    } finally {
      if (in != null) {
        in.close();
      }
    }
    return timeStamp;
  }

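  /**
   * Reads the checkpoint time recorded in this instance's shared edits and
   * image directories and verifies that the two match.
   */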
  long readLocalFstime(Configuration conf) throws IOException {
    String edits = null;
    if (instance == InstanceId.NODEZERO) {
      edits = conf.get("dfs.name.edits.dir.shared0");
    } else {
      edits = conf.get("dfs.name.edits.dir.shared1");
    }

    long editsTime = readFstime(edits);

    String image = null;
    if (instance == InstanceId.NODEZERO) {
      image = conf.get("dfs.name.dir.shared0");
    } else {
      image = conf.get("dfs.name.dir.shared1");
    }

    if (editsTime == readFstime(image)) {
      return editsTime;
    }
    throw new IOException("The checkpoint time of the local fsimage does not" +
    " match the time of the local edits");
  }

  /**
   * Returns the starting checkpoint time of this AvatarNode
   */
  long getStartCheckpointTime() {
    return startCheckpointTime;
  }

  /**
   * Sets the starting checkpoint time of this AvatarNode
   */
  void setStartCheckpointTime(Configuration conf)
    throws IOException {
    startCheckpointTime = readRemoteFstime(conf);
  }

  void setStartCheckpointTime(long time) {
    startCheckpointTime = time;
  }

  /**
   * Indicates that the AvatarNode should restart
   */
  void doRestart() {
    runInfo.doRestart = true;
  }

  /**
   * Returns true if both edits and edits.new for the
   * remote namenode exists.
   */
  boolean twoEditsFile(Configuration conf) throws IOException{
    File f1 = getRemoteEditsFile(conf);
    File f2 = getRemoteEditsFileNew(conf);
    return f1.exists() && f2.exists();
  }

  /**
   * Returns the size of the edits file for the remote
   * namenode.
   */
  long editSize(Configuration conf) throws IOException{
    return getRemoteEditsFile(conf).length();
  }

  /**
   * Current system time.
   * @return current time in msec.
   */
  static long now() {
    return System.currentTimeMillis();
  }

  /**
   * Verify that configured directories exist, then
   * Interactively confirm that formatting is desired
   * for each existing directory and format them.
   *
   * @param conf
   * @param isConfirmationNeeded
   * @return true if formatting was aborted, false otherwise
   * @throws IOException
   */
  private static boolean format(Configuration conf,
                                boolean isConfirmationNeeded
                                ) throws IOException {
    boolean allowFormat = conf.getBoolean("dfs.namenode.support.allowformat",
                                          true);
    if (!allowFormat) {
      throw new IOException("The option dfs.namenode.support.allowformat is "
                            + "set to false for this filesystem, so it "
                            + "cannot be formatted. You will need to set "
                            + "dfs.namenode.support.allowformat parameter "
                            + "to true in order to format this filesystem");
    }
    Collection<File> dirsToFormat = FSNamesystem.getNamespaceDirs(conf);
    Collection<File> editDirsToFormat =
                 FSNamesystem.getNamespaceEditsDirs(conf);
    for(Iterator<File> it = dirsToFormat.iterator(); it.hasNext();) {
      File curDir = it.next();
      if (!curDir.exists())
        continue;
      if (isConfirmationNeeded) {
        System.err.print("Re-format filesystem in " + curDir +" ? (Y or N) ");
        if (!(System.in.read() == 'Y')) {
          System.err.println("Format aborted in "+ curDir);
          return true;
        }
        while(System.in.read() != '\n'); // discard the enter-key
      }
    }

    FSNamesystem nsys = new FSNamesystem(new FSImage(dirsToFormat,
                                         editDirsToFormat), conf);
    nsys.dir.fsImage.format();
    return false;
  }

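  /**
   * Finalizes a previous upgrade, optionally asking for interactive
   * confirmation first.
   *
   * @return true if finalization was aborted, false otherwise
   */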
  private static boolean finalize(Configuration conf,
                               boolean isConfirmationNeeded
                               ) throws IOException {
    Collection<File> dirsToFormat = FSNamesystem.getNamespaceDirs(conf);
    Collection<File> editDirsToFormat =
                               FSNamesystem.getNamespaceEditsDirs(conf);
    FSNamesystem nsys = new FSNamesystem(new FSImage(dirsToFormat,
                                         editDirsToFormat), conf);
    System.err.print(
        "\"finalize\" will remove the previous state of the file system.\n"
        + "Recent upgrade will become permanent.\n"
        + "Rollback option will not be available anymore.\n");
    if (isConfirmationNeeded) {
      System.err.print("Finalize filesystem state ? (Y or N) ");
      if (!(System.in.read() == 'Y')) {
        System.err.println("Finalize aborted.");
        return true;
      }
      while(System.in.read() != '\n'); // discard the enter-key
    }
    nsys.dir.fsImage.finalizeUpgrade();
    return false;
  }
 
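  /**
   * Collects avatar-specific status for reporting: the last applied
   * transaction id, plus either the standby ingest/failover state or the
   * active checkpoint state, depending on the current avatar.
   */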
  protected Map<NameNodeKey, String> getNameNodeSpecificKeys(){
    Map<NameNodeKey, String> map = new HashMap<NameNodeKey, String>();  
    try{
     
      map.put(new NameNodeKey("Last applied transaction id", NameNodeKey.BOTH),
          toStr(getFSImage().getEditLog().getLastWrittenTxId()));
        
      if (currentAvatar == Avatar.STANDBY) {
        map.put(new NameNodeKey("Standby: ignore datanodes",
            NameNodeKey.STANDBY), toStr(this.ignoreDatanodes()));
        map.put(new NameNodeKey("Standby: ingest state", NameNodeKey.STANDBY),
            toStr((standby == null) ? "" : standby.currentIngestState));
        map.put(new NameNodeKey("Standby: ingest fell behind", NameNodeKey.STANDBY),
            toStr((standby == null) ? "" : standby.fellBehind()));
        map.put(new NameNodeKey("Standby: ingest lag bytes", NameNodeKey.STANDBY),
            toStr((standby == null) ? 0L : standby.getLagBytes()));
        map.put(new NameNodeKey("Standby: checkpoint status", NameNodeKey.STANDBY),
            toStr((standby == null) ? "" : standby.getCheckpointStatus()));
        map.put(new NameNodeKey("Standby: failover in progress",
            NameNodeKey.STANDBY), toStr(standbySafeMode.failoverInProgress()));
        if (standbySafeMode.failoverInProgress()) {
          map.put(new NameNodeKey("Standby: failover outstanding heartbeats",
              NameNodeKey.STANDBY), toStr(standbySafeMode
              .getOutStandingHeartbeats().size()));
          map.put(new NameNodeKey("Standby: failover outstanding reports",
              NameNodeKey.STANDBY), toStr(standbySafeMode
              .getOutStandingReports().size()));
        }

      } else {
        map.put(new NameNodeKey("Checkpoint state", NameNodeKey.ACTIVE),
            this.getFSImage().ckptState.toString());
      }
    } catch (Exception e) {
      // send partial information
      LOG.error(e.toString());
    }
    return map;
  }
 
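  /**
   * Returns true if this node is currently running as the active (primary)
   * avatar.
   */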
  protected boolean getIsPrimary() {
    return currentAvatar == Avatar.ACTIVE;
  }
 
  private String toStr(Object o){
    return o.toString();
  }

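  /**
   * Mutable flags shared between main() and the running AvatarNode that
   * control whether the node should restart, shut down, or keep running.
   */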
  public static class RunInfo {
    volatile boolean doRestart;
    volatile boolean shutdown;
    volatile boolean isRunning;

    public RunInfo(boolean doRestart, boolean shutdown, boolean isRunning) {
      this.doRestart = doRestart;
      this.shutdown = shutdown;
      this.isRunning = isRunning;
    }

    public RunInfo() {
      this.doRestart = false;
      this.shutdown = false;
      this.isRunning = true;
    }

  }
 
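  /**
   * Returns the RPC server address of this namenode.
   */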
  public InetSocketAddress getNameNodeAddress() {
    return serverAddress;
  }

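  /**
   * Returns the safe mode handler used when this node runs as the standby
   * avatar.
   */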
  public StandbySafeMode getStandbySafeMode() {
    return this.standbySafeMode;
  }

  /**
   * Starts an AvatarNode and keeps restarting it in-process as long as a
   * restart is requested; stops the RPC server if a shutdown was requested
   * and exits with a non-zero status on unrecoverable errors.
   */
  public static void main(String argv[]) throws Exception {
    Exception exception = null;
    AvatarNode avatarnode = null;
    RunInfo runInfo = new RunInfo();
    do {
      runInfo.doRestart = false;
      runInfo.isRunning = true;
      exception = null;
      try {
        StringUtils.startupShutdownMessage(AvatarNode.class, argv, LOG);
        avatarnode = createAvatarNode(argv, null, runInfo);
        if (avatarnode != null) {
          avatarnode.waitForRestart();
        }
      } catch (Throwable e) {
        LOG.error(StringUtils.stringifyException(e));
        if (runInfo.doRestart) {
          LOG.error("AvatarNode restarting...");
        } else {
          exception = new Exception(StringUtils.stringifyException(e));
        }
      }
    } while (runInfo.doRestart == true);
    if (runInfo.shutdown) {
      avatarnode.stopRPC(true);
    }
    if (exception != null) {
      LOG.fatal("Exception running avatarnode. Shutting down", exception);
      Runtime.getRuntime().exit(1);
    }
  }
}