Source Code of org.apache.hadoop.dfs.FSNamesystem$Replicator

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.dfs;

import org.apache.commons.logging.*;

import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.mapred.StatusHttpServer;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.fs.Path;

import java.io.*;
import java.util.*;

import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/***************************************************
* FSNamesystem does the actual bookkeeping work for the
* NameNode.
*
* It tracks several important tables.
*
* 1)  valid fsname --> blocklist  (kept on disk, logged)
* 2)  Set of all valid blocks (inverted #1)
* 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
* 4)  machine --> blocklist (inverted #2)
* 5)  LRU cache of updated-heartbeat machines
***************************************************/
class FSNamesystem implements FSConstants {
    public static final Log LOG = LogFactory.getLog("org.apache.hadoop.dfs.FSNamesystem");

    //
    // Stores the correct file name hierarchy
    //
    FSDirectory dir;

    //
    // Stores the block-->datanode(s) map.  Updated only in response
    // to client-sent information.
    // Mapping: Block -> TreeSet<DatanodeDescriptor>
    //
    Map<Block, SortedSet<DatanodeDescriptor>> blocksMap =
                              new HashMap<Block, SortedSet<DatanodeDescriptor>>();

    /**
     * Stores the datanode -> block map. 
     * <p>
     * Done by storing a set of {@link DatanodeDescriptor} objects, sorted by
     * storage id. In order to keep the storage map consistent it tracks
     * all storages ever registered with the namenode.
     * A descriptor corresponding to a specific storage id can be
     * <ul>
     * <li>added to the map if it is a new storage id;</li>
     * <li>updated with a new datanode started as a replacement for the old one
     * with the same storage id; and </li>
     * <li>removed if and only if an existing datanode is restarted to serve a
     * different storage id.</li>
     * </ul> <br>
     * The list of the {@link DatanodeDescriptor}s in the map is checkpointed
     * in the namespace image file. Only the {@link DatanodeInfo} part is
     * persistent, the list of blocks is restored from the datanode block
     * reports.
     * <p>
     * Mapping: StorageID -> DatanodeDescriptor
     */
    Map<String, DatanodeDescriptor> datanodeMap =
                                      new TreeMap<String, DatanodeDescriptor>();
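
    // Illustrative sketch (editor's note, not in the original source) of the
    // three storage-id cases described in the javadoc above:
    //
    //   datanodeMap.put("DS-1", descriptorA);  // new storage id: added
    //   descriptorA.updateRegInfo(newReg);     // replacement datanode with the
    //                                          // same storage id: updated
    //   datanodeMap.remove("DS-1");            // existing datanode restarted to
    //                                          // serve a different id: removed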

    //
    // Keeps a Collection for every named machine containing
    // blocks that have recently been invalidated and are thought to live
    // on the machine in question.
    // Mapping: StorageID -> ArrayList<Block>
    //
    private Map<String, Collection<Block>> recentInvalidateSets =
                                      new TreeMap<String, Collection<Block>>();

    //
    // Keeps a TreeSet for every named node.  Each treeset contains
    // a list of the blocks that are "extra" at that location.  We'll
    // eventually remove these extras.
    // Mapping: StorageID -> TreeSet<Block>
    //
    private Map<String, Collection<Block>> excessReplicateMap =
                                      new TreeMap<String, Collection<Block>>();

    //
    // Keeps track of files that are being created, plus the
    // blocks that make them up.
    // Mapping: fileName -> FileUnderConstruction
    //
    Map<UTF8, FileUnderConstruction> pendingCreates =
                                  new TreeMap<UTF8, FileUnderConstruction>();

    //
    // Keeps track of the blocks that are part of those pending creates
    // Set of: Block
    //
    Collection<Block> pendingCreateBlocks = new TreeSet<Block>();

    //
    // Stats on overall usage
    //
    long totalCapacity = 0, totalRemaining = 0;

    // total number of connections across all live datanodes
    int totalLoad = 0;


    //
    // For the HTTP browsing interface
    //
    StatusHttpServer infoServer;
    int infoPort;
    String infoBindAddress;
    Date startTime;
   
    //
    Random r = new Random();

    /**
     * Stores a set of DatanodeDescriptor objects.
     * This is a subset of {@link #datanodeMap}, containing nodes that are
     * considered alive.
     * The {@link HeartbeatMonitor} periodically checks for outdated entries,
     * and removes them from the list.
     */
    ArrayList<DatanodeDescriptor> heartbeats = new ArrayList<DatanodeDescriptor>();

    //
    // Store set of Blocks that need to be replicated 1 or more times.
    // We also store pending replication-orders.
    // Set of: Block
    //
    private UnderReplicationBlocks neededReplications = new UnderReplicationBlocks();
    private Collection<Block> pendingReplications = new TreeSet<Block>();

    //
    // Used for handling lock-leases
    // Mapping: leaseHolder -> Lease
    //
    private Map<UTF8, Lease> leases = new TreeMap<UTF8, Lease>();
    // Set of: Lease
    private SortedSet<Lease> sortedLeases = new TreeSet<Lease>();

    //
    // Threaded object that checks to see if we have been
    // getting heartbeats from all clients.
    //
    Daemon hbthread = null;   // HeartbeatMonitor thread
    Daemon lmthread = null;   // LeaseMonitor thread
    Daemon smmthread = null;  // SafeModeMonitor thread
    boolean fsRunning = true;
    long systemStart = 0;

    //  The maximum number of replicas we should allow for a single block
    private int maxReplication;
    //  How many outgoing replication streams a given node should have at one time
    private int maxReplicationStreams;
    // MIN_REPLICATION is how many copies we need in place or else we disallow the write
    private int minReplication;
    // heartbeatRecheckInterval is how often namenode checks for expired datanodes
    private long heartbeatRecheckInterval;
    // heartbeatExpireInterval is how long namenode waits for datanode to report
    // heartbeat
    private long heartbeatExpireInterval;

    public static FSNamesystem fsNamesystemObject;
    private String localMachine;
    private int port;
    private SafeModeInfo safeMode;  // safe mode information
   
    // datanode network topology
    NetworkTopology clusterMap = new NetworkTopology();
    // for block replicas placement
    Replicator replicator = new Replicator();

    /**
     * dirs is a list of directories where the filesystem directory state
     * is stored
     */
    public FSNamesystem(File[] dirs,
                        String hostname,
                        int port,
                        NameNode nn, Configuration conf) throws IOException {
        fsNamesystemObject = this;
        this.maxReplication = conf.getInt("dfs.replication.max", 512);
        this.minReplication = conf.getInt("dfs.replication.min", 1);
        if( minReplication <= 0 )
          throw new IOException(
              "Unexpected configuration parameters: dfs.replication.min = "
              + minReplication
              + " must be greater than 0" );
        if( maxReplication >= (int)Short.MAX_VALUE )
          throw new IOException(
              "Unexpected configuration parameters: dfs.replication.max = "
              + maxReplication + " must be less than " + (Short.MAX_VALUE) );
        if( maxReplication < minReplication )
          throw new IOException(
              "Unexpected configuration parameters: dfs.replication.min = "
              + minReplication
              + " must be less than dfs.replication.max = "
              + maxReplication );
        this.maxReplicationStreams = conf.getInt("dfs.max-repl-streams", 2);
        long heartbeatInterval = conf.getLong("dfs.heartbeat.interval", 3) * 1000;
        this.heartbeatRecheckInterval = 5 * 60 * 1000; // 5 minutes
        this.heartbeatExpireInterval = 2 * heartbeatRecheckInterval +
            10 * heartbeatInterval;
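        // Editor's note (not in the original source): with the defaults above,
        // heartbeatExpireInterval = 2 * 300,000ms + 10 * 3,000ms = 630,000ms,
        // so a datanode is considered dead after ~10.5 minutes of silence.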

        this.localMachine = hostname;
        this.port = port;
        this.dir = new FSDirectory(dirs);
        this.dir.loadFSImage( conf );
        this.safeMode = new SafeModeInfo( conf );
        setBlockTotal();
        this.hbthread = new Daemon(new HeartbeatMonitor());
        this.lmthread = new Daemon(new LeaseMonitor());
        hbthread.start();
        lmthread.start();
        this.systemStart = now();
        this.startTime = new Date(systemStart);

        this.infoPort = conf.getInt("dfs.info.port", 50070);
        this.infoBindAddress = conf.get("dfs.info.bindAddress", "0.0.0.0");
        this.infoServer = new StatusHttpServer("dfs",infoBindAddress, infoPort, false);
        this.infoServer.setAttribute("name.system", this);
        this.infoServer.setAttribute("name.node", nn);
        this.infoServer.setAttribute("name.conf", conf);
        this.infoServer.addServlet("fsck", "/fsck", FsckServlet.class);
        this.infoServer.addServlet("getimage", "/getimage", GetImageServlet.class);
        this.infoServer.start();
    }
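
    // Illustrative configuration (editor's sketch, not in the original
    // source) listing the keys read by the constructor above, with the
    // built-in defaults it falls back on:
    //
    //   dfs.replication.max    = 512       // hard cap on replicas per block
    //   dfs.replication.min    = 1         // writes need at least this many targets
    //   dfs.max-repl-streams   = 2         // outgoing replication streams per node
    //   dfs.heartbeat.interval = 3         // seconds between datanode heartbeats
    //   dfs.info.port          = 50070     // HTTP status server port
    //   dfs.info.bindAddress   = 0.0.0.0   // HTTP status server bind address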

    /**
     * dirs is a list of directories where the filesystem directory state
     * is stored
     */
    FSNamesystem(FSImage fsImage) throws IOException {
        fsNamesystemObject = this;
        this.dir = new FSDirectory(fsImage);
    }

    /** Return the FSNamesystem object
     *
     */
    public static FSNamesystem getFSNamesystem() {
        return fsNamesystemObject;
    }

    /** Close down this filesystem manager.
     * Causes heartbeat and lease daemons to stop; waits briefly for
     * them to finish, but a short timeout returns control to the caller.
     */
    public void close() {
      synchronized (this) {
        fsRunning = false;
      }
        try {
            infoServer.stop();
            hbthread.join(3000);
        } catch (InterruptedException ie) {
        } finally {
          // using finally to ensure we also wait for lease daemon
          try {
            lmthread.join(3000);
          } catch (InterruptedException ie) {
          } finally {
              try {
                dir.close();
              } catch (IOException ex) {
                  // do nothing
              }
          }
        }
    }
   
    /* get replication factor of a block */
    private int getReplication( Block block ) {
        FSDirectory.INode fileINode = dir.getFileByBlock(block);
        if( fileINode == null ) { // block does not belong to any file
            return 0;
        } else {
            return fileINode.getReplication();
        }
    }

    /* Class for keeping track of under-replicated blocks.
     * Blocks have replication priority, with priority 0 indicating the
     * highest; blocks that have only a single replica get the highest
     * priority.
     */
    private class UnderReplicationBlocks {
        private static final int LEVEL = 3;
        TreeSet<Block>[] priorityQueues = new TreeSet[LEVEL];
       
        /* constructor */
        UnderReplicationBlocks() {
            for(int i=0; i<LEVEL; i++) {
                priorityQueues[i] = new TreeSet<Block>();
            }
        }
       
        /* Return the total number of under-replicated blocks */
        synchronized int size() {
            int size = 0;
            for( int i=0; i<LEVEL; i++ ) {
                size += priorityQueues[i].size();
            }
            return size;
        }
       
        /* Return the priority of a block
         * @param block an under-replicated block
         * @param curReplicas current number of replicas of the block
         * @param expectedReplicas expected number of replicas of the block
         */
        private int getPriority(Block block,
                int curReplicas, int expectedReplicas) {
            if (curReplicas>=expectedReplicas) {
                return LEVEL; // no need to replicate
            } else if(curReplicas==1) {
                return 0; // highest priority
            } else if(curReplicas*3<expectedReplicas) {
                return 1;
            } else {
                return 2;
            }
        }
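
        /* Illustrative examples (editor's note, not in the original source)
         * of how the policy above assigns levels, given LEVEL == 3:
         *   getPriority(b, 1, 3)  -> 0   single replica: highest priority
         *   getPriority(b, 2, 10) -> 1   fewer than a third of expected
         *   getPriority(b, 2, 3)  -> 2   mildly under-replicated
         *   getPriority(b, 3, 3)  -> 3   == LEVEL: not queued at all
         */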
       
        /* Add a block to an under-replication queue according to its priority
         * @param block an under-replicated block
         * @param curReplicas current number of replicas of the block
         * @param expectedReplicas expected number of replicas of the block
         */
        synchronized boolean add(
            Block block, int curReplicas, int expectedReplicas) {
            if(expectedReplicas <= curReplicas) {
                return false;
            }
            int priLevel = getPriority(block, curReplicas, expectedReplicas);
            if( priorityQueues[priLevel].add(block) ) {
                NameNode.stateChangeLog.debug(
                        "BLOCK* NameSystem.UnderReplicationBlock.add:"
                      + block.getBlockName()
                      + " has only "+curReplicas
                      + " replicas and need " + expectedReplicas
                      + " replicas so is added to neededReplications"
                      + " at priority level " + priLevel );
                return true;
            }
            return false;
        }

        /* Add a block to an under-replication queue */
        synchronized boolean add(Block block) {
            int curReplicas = countContainingNodes(blocksMap.get(block));
            int expectedReplicas = getReplication(block);
            return add(block, curReplicas, expectedReplicas);
        }
       
        /* Remove a block from an under-replication queue */
        synchronized boolean remove(Block block,
                int oldReplicas, int oldExpectedReplicas) {
            if(oldExpectedReplicas <= oldReplicas) {
                return false;
            }
            int priLevel = getPriority(block, oldReplicas, oldExpectedReplicas);
            return remove(block, priLevel);
        }
       
        /* Remove a block from an under-replication queue given its priority */
        private boolean remove(Block block, int priLevel ) {
            if( priorityQueues[priLevel].remove(block) ) {
                NameNode.stateChangeLog.debug(
                     "BLOCK* NameSystem.UnderReplicationBlock.remove: "
                   + "Removing block " + block.getBlockName()
                   + " from priority queue "+ priLevel );
                return true;
            } else {
                for(int i=0; i<LEVEL; i++) {
                    if( i!=priLevel && priorityQueues[i].remove(block) ) {
                        NameNode.stateChangeLog.debug(
                             "BLOCK* NameSystem.UnderReplicationBlock.remove: "
                           + "Removing block " + block.getBlockName()
                           + " from priority queue "+ i );
                        return true;
                    }
                }
            }
            return false;
        }
       
        /* Remove a block from an under-replication queue */
        synchronized boolean remove(Block block) {
            int curReplicas = countContainingNodes(blocksMap.get(block));
            int expectedReplicas = getReplication(block);
            return remove(block, curReplicas, expectedReplicas);
        }
       
        /* update the priority level of a block */
        synchronized void update(Block block,
                int curReplicasDelta, int expectedReplicasDelta) {
            int curReplicas = countContainingNodes(blocksMap.get(block));
            int curExpectedReplicas = getReplication(block);
            int oldReplicas = curReplicas-curReplicasDelta;
            int oldExpectedReplicas = curExpectedReplicas-expectedReplicasDelta;
            int curPri = getPriority(block, curReplicas, curExpectedReplicas);
            int oldPri = getPriority(block, oldReplicas, oldExpectedReplicas);
            if( oldPri != LEVEL && oldPri != curPri ) {
                remove(block, oldPri);
            }
            if( curPri != LEVEL && oldPri != curPri
                    && priorityQueues[curPri].add(block)) {
                NameNode.stateChangeLog.debug(
                        "BLOCK* NameSystem.UnderReplicationBlock.update:"
                      + block.getBlockName()
                      + " has only "+curReplicas
                      + " replicas and need " + curExpectedReplicas
                      + " replicas so is added to neededReplications"
                      + " at priority level " + curPri );
            }
        }
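
        /* Illustrative call (editor's sketch, not in the original source):
         * the deltas are relative to the previous state, so when one replica
         * of a block is lost a caller would invoke
         *     neededReplications.update(b, -1, 0);
         * which recomputes the old and new priorities and moves the block
         * between queues only when the level actually changed.
         */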
       
        /* Return an iterator over all under-replicated blocks */
        synchronized Iterator<Block> iterator() {
            return new Iterator<Block>() {
                int level;
                Iterator<Block>[] iterator = new Iterator[LEVEL];
               
                {
                    level=0;
                    for(int i=0; i<LEVEL; i++) {
                        iterator[i] = priorityQueues[i].iterator();
                    }
                }
               
                private void update() {
                    while( level< LEVEL-1 && !iterator[level].hasNext()  ) {
                        level++;
                    }
                }
               
                public Block next() {
                    update();
                    return iterator[level].next();
                }
               
                public boolean hasNext() {
                    update();
                    return iterator[level].hasNext();
                }
               
                public void remove() {
                    iterator[level].remove();
                }
            };
        }
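
        /* Illustrative usage (editor's sketch, not in the original source):
         * the iterator drains level 0 before moving on, so walking it visits
         * blocks in strictly decreasing urgency.
         *
         *     for (Iterator<Block> it = neededReplications.iterator();
         *          it.hasNext(); ) {
         *         Block b = it.next();   // single-replica blocks come first
         *     }
         */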
    }
   
    /////////////////////////////////////////////////////////
    //
    // These methods are called by HadoopFS clients
    //
    /////////////////////////////////////////////////////////
    /**
     * The client wants to open the given filename.  Return a
     * list of (block,machineArray) pairs.  The sequence of unique blocks
     * in the list indicates all the blocks that make up the filename.
     *
     * The client should choose one of the machines from the machineArray
     * at random.
     */
    public Object[] open(String clientMachine, UTF8 src) {
        Object results[] = null;
        Block blocks[] = dir.getFile(src);
        if (blocks != null) {
            results = new Object[2];
            DatanodeDescriptor machineSets[][] = new DatanodeDescriptor[blocks.length][];

            for (int i = 0; i < blocks.length; i++) {
              SortedSet<DatanodeDescriptor> containingNodes = blocksMap.get(blocks[i]);
                if (containingNodes == null) {
                    machineSets[i] = new DatanodeDescriptor[0];
                } else {
                    machineSets[i] = new DatanodeDescriptor[containingNodes.size()];
                    ArrayList<DatanodeDescriptor> containingNodesList =
                      new ArrayList<DatanodeDescriptor>(containingNodes.size());
                    containingNodesList.addAll(containingNodes);
                   
                    machineSets[i] = replicator.sortByDistance(
                        getDatanodeByHost(clientMachine), containingNodesList);
                }
            }

            results[0] = blocks;
            results[1] = machineSets;
        }
        return results;
    }
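
    // Illustrative caller (editor's sketch, not in the original source)
    // unpacking the pair returned by open():
    //
    //   Object[] res = namesystem.open(clientMachine, src);
    //   if (res != null) {
    //       Block[] blocks = (Block[]) res[0];
    //       DatanodeDescriptor[][] machines = (DatanodeDescriptor[][]) res[1];
    //       // machines[i] holds the replica locations for blocks[i],
    //       // sorted by network distance from the client.
    //   }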

    /**
     * Set replication for an existing file.
     *
     * The NameNode sets new replication and schedules either replication of
     * under-replicated data blocks or removal of the excessive block copies
     * if the blocks are over-replicated.
     *
     * @see ClientProtocol#setReplication(String, short)
     * @param src file name
     * @param replication new replication
     * @return true if successful;
     *         false if file does not exist or is a directory
     * @author shv
     */
    public synchronized boolean setReplication(String src,
                                               short replication
                                              ) throws IOException {
      if( isInSafeMode() )
        throw new SafeModeException( "Cannot set replication for " + src, safeMode );
      verifyReplication(src, replication, null );

      Vector<Integer> oldReplication = new Vector<Integer>();
      Block[] fileBlocks;
      fileBlocks = dir.setReplication( src, replication, oldReplication );
      if( fileBlocks == null ) // file not found or is a directory
        return false;
      int oldRepl = oldReplication.elementAt(0).intValue();
      if( oldRepl == replication ) // the same replication
        return true;

      // update needReplication priority queues
      if( replication > oldRepl )
        LOG.info("Increasing replication for file " + src
                + ". New replication is " + replication );
      for( int idx = 0; idx < fileBlocks.length; idx++ )
          neededReplications.update( fileBlocks[idx], 0, replication-oldRepl );
     
      if( oldRepl > replication ) { 
        // old replication > the new one; need to remove copies
        LOG.info("Reducing replication for file " + src
                + ". New replication is " + replication );
        for( int idx = 0; idx < fileBlocks.length; idx++ )
          proccessOverReplicatedBlock( fileBlocks[idx], replication );
      }
      return true;
    }
   
    public long getBlockSize(String filename) throws IOException {
      return dir.getBlockSize(filename);
    }
   
    /**
     * Check whether the replication parameter is within the range
     * determined by system configuration.
     */
    private void verifyReplication( String src,
                                    short replication,
                                    UTF8 clientName
                                  ) throws IOException {
      String text = "file " + src
              + ((clientName != null) ? " on client " + clientName : "")
              + ".\n"
              + "Requested replication " + replication;

      if( replication > maxReplication )
        throw new IOException( text + " exceeds maximum " + maxReplication );
     
      if( replication < minReplication )
        throw new IOException(
            text + " is less than the required minimum " + minReplication );
    }
   
    /**
     * The client would like to create a new block for the indicated
     * filename.  Return an array that consists of the block, plus a set
     * of machines.  The first on this list should be where the client
     * writes data.  Subsequent items in the list must be provided in
     * the connection to the first datanode.
     * @return Return an array that consists of the block, plus a set
     * of machines
     * @throws IOException if the filename is invalid
     *         {@link FSDirectory#isValidToCreate(UTF8)}.
     */
    public synchronized Object[] startFile( UTF8 src,
                                            UTF8 holder,
                                            UTF8 clientMachine,
                                            boolean overwrite,
                                            short replication,
                                            long blockSize
                                          ) throws IOException {
      NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: file "
            +src+" for "+holder+" at "+clientMachine);
      if( isInSafeMode() )
        throw new SafeModeException( "Cannot create file" + src, safeMode );
      if (!isValidName(src.toString())) {
        throw new IOException("Invalid file name: " + src);         
      }
      try {
        FileUnderConstruction pendingFile = pendingCreates.get(src);
        if (pendingFile != null) {
          //
          // If the file exists in pendingCreate, then it must be in our
          // leases. Find the appropriate lease record.
          //
          Lease lease = leases.get(holder);
          //
          // We found the lease for this file. And surprisingly the original
          // holder is trying to recreate this file. This should never occur.
          //
          if (lease != null) {
            throw new AlreadyBeingCreatedException(
                  "failed to create file " + src + " for " + holder +
                  " on client " + clientMachine +
                  " because current leaseholder is trying to recreate file.");
          }
          //
          // Find the original holder.
          //
          UTF8 oldholder = pendingFile.getClientName();
          lease = leases.get(oldholder);
          if (lease == null) {
            throw new AlreadyBeingCreatedException(
                  "failed to create file " + src + " for " + holder +
                  " on client " + clientMachine +
                  " because pendingCreates is non-null but no leases found.");
          }
          //
          // If the original holder has not renewed in the last SOFTLIMIT
          // period, then reclaim all resources and allow this request
          // to proceed. Otherwise, prevent this request from creating file.
          //
          if (lease.expiredSoftLimit()) {
            lease.releaseLocks();
            leases.remove(lease.holder);
            LOG.info("Removing lease " + lease + " ");
            if (!sortedLeases.remove(lease)) {
              LOG.error("Unknown failure trying to remove " + lease +
                       " from lease set.");
            }
          } else  {
            throw new AlreadyBeingCreatedException(
                  "failed to create file " + src + " for " + holder +
                  " on client " + clientMachine +
                  " because pendingCreates is non-null.");
          }
        }

        try {
           verifyReplication(src.toString(), replication, clientMachine );
        } catch( IOException e) {
            throw new IOException( "failed to create "+e.getMessage());
        }
        if (!dir.isValidToCreate(src)) {
          if (overwrite) {
            delete(src);
          } else {
            throw new IOException("failed to create file " + src
                    +" on client " + clientMachine
                    +" either because the filename is invalid or the file exists");
          }
        }

        // Get the array of replication targets
        DatanodeDescriptor targets[] = replicator.chooseTarget(replication,
            getDatanodeByHost(clientMachine.toString()), null, blockSize);
        if (targets.length < this.minReplication) {
            throw new IOException("failed to create file "+src
                    +" on client " + clientMachine
                    +" because target-length is " + targets.length
                    +", below MIN_REPLICATION (" + minReplication+ ")");
        }

        // Reserve space for this pending file
        pendingCreates.put(src,
                           new FileUnderConstruction(replication,
                                                     blockSize,
                                                     holder,
                                                     clientMachine));
        NameNode.stateChangeLog.debug( "DIR* NameSystem.startFile: "
                   +"add "+src+" to pendingCreates for "+holder );
        synchronized (leases) {
            Lease lease = leases.get(holder);
            if (lease == null) {
                lease = new Lease(holder);
                leases.put(holder, lease);
                sortedLeases.add(lease);
            } else {
                sortedLeases.remove(lease);
                lease.renew();
                sortedLeases.add(lease);
            }
            lease.startedCreate(src);
        }

        // Create next block
        Object results[] = new Object[2];
        results[0] = allocateBlock(src);
        results[1] = targets;
        return results;
      } catch (IOException ie) {
          NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: "
                  +ie.getMessage());
        throw ie;
      }
    }

    /**
     * The client would like to obtain an additional block for the indicated
     * filename (which is being written-to).  Return an array that consists
     * of the block, plus a set of machines.  The first on this list should
     * be where the client writes data.  Subsequent items in the list must
     * be provided in the connection to the first datanode.
     *
     * Make sure the previous blocks have been reported by datanodes and
     * are replicated.  Will return an empty 2-elt array if we want the
     * client to "try again later".
     */
    public synchronized Object[] getAdditionalBlock(UTF8 src,
                                                    UTF8 clientName
                                                    ) throws IOException {
        NameNode.stateChangeLog.debug("BLOCK* NameSystem.getAdditionalBlock: file "
            +src+" for "+clientName);
        if( isInSafeMode() )
          throw new SafeModeException( "Cannot add block to " + src, safeMode );
        FileUnderConstruction pendingFile = pendingCreates.get(src);
        // make sure that we still have the lease on this file
        if (pendingFile == null) {
          throw new LeaseExpiredException("No lease on " + src);
        }
        if (!pendingFile.getClientName().equals(clientName)) {
          throw new LeaseExpiredException("Lease mismatch on " + src +
              " owned by " + pendingFile.getClientName() +
              " and appended by " + clientName);
        }
        if (dir.getFile(src) != null) {
          throw new IOException("File " + src + " created during write");
        }

        //
        // If we fail this, bad things happen!
        //
        if (!checkFileProgress(src)) {
          throw new NotReplicatedYetException("Not replicated yet");
        }
       
        // Get the array of replication targets
        String clientHost = pendingFile.getClientMachine().toString();
        DatanodeDescriptor targets[] = replicator.chooseTarget(
            (int)(pendingFile.getReplication()),
            getDatanodeByHost(clientHost),
            null,
            pendingFile.getBlockSize());
        if (targets.length < this.minReplication) {
          throw new IOException("File " + src + " could only be replicated to " +
                                targets.length + " nodes, instead of " +
                                minReplication);
        }
       
        // Create next block
        return new Object[]{allocateBlock(src), targets};
    }

    /**
     * The client would like to let go of the given block
     */
    public synchronized boolean abandonBlock(Block b, UTF8 src) {
        //
        // Remove the block from the pending creates list
        //
        NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
                +b.getBlockName()+"of file "+src );
        FileUnderConstruction pendingFile = pendingCreates.get(src);
        if (pendingFile != null) {
            Collection<Block> pendingVector = pendingFile.getBlocks();
            for (Iterator<Block> it = pendingVector.iterator(); it.hasNext(); ) {
                Block cur = it.next();
                if (cur.compareTo(b) == 0) {
                    pendingCreateBlocks.remove(cur);
                    it.remove();
                    NameNode.stateChangeLog.debug(
                             "BLOCK* NameSystem.abandonBlock: "
                            +b.getBlockName()
                            +" is removed from pendingCreateBlock and pendingCreates");
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Abandon the entire file in progress
     */
    public synchronized void abandonFileInProgress(UTF8 src,
                                                   UTF8 holder
                                                   ) throws IOException {
      NameNode.stateChangeLog.debug("DIR* NameSystem.abandonFileInProgress:" + src );
      synchronized (leases) {
        // find the lease
        Lease lease = leases.get(holder);
        if (lease != null) {
          // remove the file from the lease
          if (lease.completedCreate(src)) {
            // if we found the file in the lease, remove it from pendingCreates
            internalReleaseCreate(src, holder);
          } else {
            LOG.info("Attempt by " + holder.toString() +
                " to release someone else's create lock on " +
                src.toString());
          }
        } else {
          LOG.info("Attempt to release a lock from an unknown lease holder "
              + holder.toString() + " for " + src.toString());
        }
      }
    }

    /**
     * Finalize the created file and make it world-accessible.  The
     * FSNamesystem will already know the blocks that make up the file.
     * Before we return, we make sure that all the file's blocks have
     * been reported by datanodes and are replicated correctly.
     */
    public synchronized int completeFile( UTF8 src,
                                          UTF8 holder) throws IOException {
        NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " + src + " for " + holder );
        if( isInSafeMode() )
          throw new SafeModeException( "Cannot complete file " + src, safeMode );
        if (dir.getFile(src) != null || pendingCreates.get(src) == null) {
            NameNode.stateChangeLog.warn( "DIR* NameSystem.completeFile: "
                    + "failed to complete " + src
                    + " because dir.getFile()==" + dir.getFile(src)
                    + " and " + pendingCreates.get(src));
            return OPERATION_FAILED;
        } else if (! checkFileProgress(src)) {
            return STILL_WAITING;
        }
       
        FileUnderConstruction pendingFile = pendingCreates.get(src);
        Collection<Block> blocks = pendingFile.getBlocks();
        int nrBlocks = blocks.size();
        Block pendingBlocks[] = blocks.toArray(new Block[nrBlocks]);

        //
        // We have the pending blocks, but they won't have
        // length info in them (as they were allocated before
        // data-write took place).  So we need to add the correct
        // length info to each
        //
        // REMIND - mjc - this is very inefficient!  We should
        // improve this!
        //
        for (int i = 0; i < nrBlocks; i++) {
            Block b = pendingBlocks[i];
            SortedSet<DatanodeDescriptor> containingNodes = blocksMap.get(b);
            DatanodeDescriptor node = containingNodes.first();
            for (Iterator<Block> it = node.getBlockIterator(); it.hasNext(); ) {
                Block cur = it.next();
                if (b.getBlockId() == cur.getBlockId()) {
                    b.setNumBytes(cur.getNumBytes());
                    break;
                }
            }
        }
       
        //
        // Now we can add the (name,blocks) tuple to the filesystem
        //
        if ( ! dir.addFile(src, pendingBlocks, pendingFile.getReplication())) {
          return OPERATION_FAILED;
        }

        // The file is no longer pending
        pendingCreates.remove(src);
        NameNode.stateChangeLog.debug(
             "DIR* NameSystem.completeFile: " + src
           + " is removed from pendingCreates");
        for (int i = 0; i < nrBlocks; i++) {
            pendingCreateBlocks.remove(pendingBlocks[i]);
        }

        synchronized (leases) {
            Lease lease = leases.get(holder);
            if (lease != null) {
                lease.completedCreate(src);
                if (! lease.hasLocks()) {
                    leases.remove(holder);
                    sortedLeases.remove(lease);
                }
            }
        }

        //
        // REMIND - mjc - this should be done only after we wait a few secs.
        // The namenode isn't giving datanodes enough time to report the
        // replicated blocks that are automatically done as part of a client
        // write.
        //

        // Now that the file is real, we need to be sure to replicate
        // the blocks.
        int numExpectedReplicas = pendingFile.getReplication();
        for (int i = 0; i < nrBlocks; i++) {
          SortedSet<DatanodeDescriptor> containingNodes = blocksMap.get(pendingBlocks[i]);
          // filter out containingNodes that are marked for decommission.
          int numCurrentReplica = countContainingNodes(containingNodes);

            if (numCurrentReplica < numExpectedReplicas) {
                neededReplications.add(
                      pendingBlocks[i], numCurrentReplica, numExpectedReplicas);
            }
        }
        return COMPLETE_SUCCESS;
    }

    static Random randBlockId = new Random();
   
    /**
     * Allocate a block at the given pending filename
     */
    synchronized Block allocateBlock(UTF8 src) {
        Block b = null;
        do {
            b = new Block(FSNamesystem.randBlockId.nextLong(), 0);
        } while (dir.isValidBlock(b));
        FileUnderConstruction v = pendingCreates.get(src);
        v.getBlocks().add(b);
        pendingCreateBlocks.add(b);
        NameNode.stateChangeLog.debug("BLOCK* NameSystem.allocateBlock: "
            +src+ ". "+b.getBlockName()+
            " is created and added to pendingCreates and pendingCreateBlocks" );     
        return b;
    }

    /**
     * Check that the indicated file's blocks are present and
     * replicated.  If not, return false.
     */
    synchronized boolean checkFileProgress(UTF8 src) {
        FileUnderConstruction v = pendingCreates.get(src);

        for (Iterator<Block> it = v.getBlocks().iterator(); it.hasNext(); ) {
            Block b = it.next();
            SortedSet<DatanodeDescriptor> containingNodes = blocksMap.get(b);
            if (containingNodes == null || containingNodes.size() < this.minReplication) {
                return false;
            }
        }
        return true;
    }

    /**
     * Adds block to list of blocks which will be invalidated on
     * specified datanode.
     */
    private void addToInvalidates(Block b, DatanodeInfo n) {
      Collection<Block> invalidateSet = recentInvalidateSets.get(n.getStorageID());
      if (invalidateSet == null) {
        invalidateSet = new ArrayList<Block>();
        recentInvalidateSets.put(n.getStorageID(), invalidateSet);
      }
      invalidateSet.add(b);
    }

    /**
     * Invalidates the given block on the given datanode.
     */
    public synchronized void invalidateBlock(Block blk, DatanodeInfo dn)
        throws IOException {
      NameNode.stateChangeLog.info("DIR* NameSystem.invalidateBlock: "
                                    + blk.getBlockName() + " on "
                                    + dn.getName());
      if (isInSafeMode()) {
        throw new SafeModeException("Cannot invalidate block " + blk.getBlockName(), safeMode);
      }

      Collection<DatanodeDescriptor> containingNodes = blocksMap.get(blk);

      // Check how many copies we have of the block.  If we have at least one
      // copy on a live node, then we can delete it.
      if (containingNodes != null ) {
        if ((countContainingNodes(containingNodes) > 1) ||
            ((countContainingNodes(containingNodes) == 1) &&
             (dn.isDecommissionInProgress() || dn.isDecommissioned()))) {
          addToInvalidates(blk, dn);
          removeStoredBlock(blk, getDatanode(dn));
          NameNode.stateChangeLog.info("BLOCK* NameSystem.invalidateBlocks: "
                                        + blk.getBlockName() + " on "
                                        + dn.getName() + " listed for deletion.");
        } else {
          NameNode.stateChangeLog.info("BLOCK* NameSystem.invalidateBlocks: "
                                        + blk.getBlockName() + " on "
                                        + dn.getName() + " is the only copy and was not deleted.");
        }
      }
    }

    ////////////////////////////////////////////////////////////////
    // Here's how to handle block-copy failure during client write:
    // -- As usual, the client's write should result in a streaming
    // backup write to a k-machine sequence.
    // -- If one of the backup machines fails, no worries.  Fail silently.
    // -- Before client is allowed to close and finalize file, make sure
    // that the blocks are backed up.  Namenode may have to issue specific backup
    // commands to make up for earlier datanode failures.  Once all copies
    // are made, edit namespace and return to client.
    ////////////////////////////////////////////////////////////////

    /**
     * Change the indicated filename.
     */
    public synchronized boolean renameTo(UTF8 src, UTF8 dst) throws IOException {
        NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src + " to " + dst );
        if( isInSafeMode() )
          throw new SafeModeException( "Cannot rename " + src, safeMode );
        if (!isValidName(dst.toString())) {
          throw new IOException("Invalid name: " + dst);
        }
        return dir.renameTo(src, dst);
    }

    /**
     * Remove the indicated filename from the namespace.  This may
     * invalidate some blocks that make up the file.
     */
    public synchronized boolean delete(UTF8 src) throws IOException {
        NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src );
        if( isInSafeMode() )
          throw new SafeModeException( "Cannot delete " + src, safeMode );
        Block deletedBlocks[] = dir.delete(src);
        if (deletedBlocks != null) {
            for (int i = 0; i < deletedBlocks.length; i++) {
                Block b = deletedBlocks[i];

                SortedSet<DatanodeDescriptor> containingNodes = blocksMap.get(b);
                if (containingNodes != null) {
                    for (Iterator<DatanodeDescriptor> it = containingNodes.iterator(); it.hasNext(); ) {
                        DatanodeDescriptor node = it.next();
                        addToInvalidates(b, node);
                        NameNode.stateChangeLog.debug("BLOCK* NameSystem.delete: "
                            + b.getBlockName() + " is added to invalidSet of " + node.getName() );
                    }
                }
            }
        }

        return (deletedBlocks != null);
    }

    /**
     * Return whether the given filename exists
     */
    public boolean exists(UTF8 src) {
        if (dir.getFile(src) != null || dir.isDir(src)) {
            return true;
        } else {
            return false;
        }
    }

    /**
     * Whether the given name is a directory
     */
    public boolean isDir(UTF8 src) {
        return dir.isDir(src);
    }

    /**
     * Whether the pathname is valid.  Currently prohibits relative paths,
     * and path elements equal to ".." or "." or containing ":" or "/".
     */
    static boolean isValidName(String src) {
     
      // Path must be absolute.
      if (!src.startsWith(Path.SEPARATOR)) {
        return false;
      }
     
      // Check for ".." "." ":" "/"
      StringTokenizer tokens = new StringTokenizer(src, Path.SEPARATOR);
      while( tokens.hasMoreTokens()) {
        String element = tokens.nextToken();
        if (element.equals("..") ||
            element.equals("."||
            (element.indexOf(":") >= 0||
            (element.indexOf("/") >= 0)) {
          return false;
        }
      }
      return true;
    }
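
    // Illustrative results (editor's note, not in the original source) of
    // the checks above:
    //   isValidName("/user/data")  -> true   absolute, clean elements
    //   isValidName("user/data")   -> false  relative path
    //   isValidName("/user/../x")  -> false  element equals ".."
    //   isValidName("/a:b/c")      -> false  element contains ':'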
   
    /**
     * Create all the necessary directories
     */
    public synchronized boolean mkdirs( String src ) throws IOException {
        boolean    success;
        NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src );
        if( isInSafeMode() )
          throw new SafeModeException( "Cannot create directory " + src, safeMode );
        if (!isValidName(src)) {
          throw new IOException("Invalid directory name: " + src);
        }
        success = dir.mkdirs(src);
        if (!success) {
          throw new IOException("Invalid directory name: " + src);
        }
        return success;
    }

    /**
     * Figure out a few hosts that are likely to contain the
     * block(s) referred to by the given (filename, start, len) tuple.
     */
    public String[][] getDatanodeHints(String src, long start, long len) {
        if (start < 0 || len < 0) {
            return new String[0][];
        }

        int startBlock = -1;
        int endBlock = -1;
        Block blocks[] = dir.getFile( new UTF8( src ));

        if (blocks == null) {                     // no blocks
            return new String[0][];
        }

        //
        // First, figure out where the range falls in
        // the blocklist.
        //
        long startpos = start;
        long endpos = start + len;
        for (int i = 0; i < blocks.length; i++) {
            if (startpos >= 0) {
                startpos -= blocks[i].getNumBytes();
                if (startpos <= 0) {
                    startBlock = i;
                }
            }
            if (endpos >= 0) {
                endpos -= blocks[i].getNumBytes();
                if (endpos <= 0) {
                    endBlock = i;
                    break;
                }
            }
        }

        //
        // Next, create an array of hosts where each block can
        // be found
        //
        if (startBlock < 0 || endBlock < 0) {
            return new String[0][];
        } else {
          String hosts[][] = new String[(endBlock - startBlock) + 1][];
            for (int i = startBlock; i <= endBlock; i++) {
              SortedSet<DatanodeDescriptor> containingNodes = blocksMap.get(blocks[i]);
                Collection<String> v = new ArrayList<String>();
                if (containingNodes != null) {
                  for (Iterator<DatanodeDescriptor> it =containingNodes.iterator(); it.hasNext();) {
                    v.add( it.next().getHost() );
                  }
                }
                hosts[i-startBlock] = v.toArray(new String[v.size()]);
            }
            return hosts;
        }
    }
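
    // Worked example (editor's note, not in the original source): with two
    // blocks of 64MB each, getDatanodeHints(src, 70MB, 10MB) runs:
    //   i=0: startpos 70-64 = 6   (> 0),   endpos 80-64 = 16  (> 0)
    //   i=1: startpos 6-64  = -58 (<= 0) -> startBlock = 1
    //        endpos   16-64 = -48 (<= 0) -> endBlock   = 1
    // so the returned hints cover the hosts of block 1 only.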

    /************************************************************
     * A Lease governs all the locks held by a single client.
     * For each client there's a corresponding lease, whose
     * timestamp is updated when the client periodically
     * checks in.  If the client dies and allows its lease to
     * expire, all the corresponding locks can be released.
     *************************************************************/
    class Lease implements Comparable<Lease> {
        public UTF8 holder;
        public long lastUpdate;
        private Collection<UTF8> locks = new TreeSet<UTF8>();
        private Collection<UTF8> creates = new TreeSet<UTF8>();

        public Lease(UTF8 holder) {
            this.holder = holder;
            renew();
        }
        public void renew() {
            this.lastUpdate = now();
        }
        /**
         * Returns true if the Hard Limit Timer has expired
         */
        public boolean expiredHardLimit() {
            if (now() - lastUpdate > LEASE_HARDLIMIT_PERIOD) {
                return true;
            }
            return false;
        }
        /**
         * Returns true if the Soft Limit Timer has expired
         */
        public boolean expiredSoftLimit() {
            if (now() - lastUpdate > LEASE_SOFTLIMIT_PERIOD) {
                return true;
            }
            return false;
        }
        public void obtained(UTF8 src) {
            locks.add(src);
        }
        public void released(UTF8 src) {
            locks.remove(src);
        }
        public void startedCreate(UTF8 src) {
            creates.add(src);
        }
        public boolean completedCreate(UTF8 src) {
            return creates.remove(src);
        }
        public boolean hasLocks() {
            return (locks.size() + creates.size()) > 0;
        }
        public void releaseLocks() {
            for (Iterator<UTF8> it = locks.iterator(); it.hasNext(); )
                internalReleaseLock(it.next(), holder);
            locks.clear();
            for (Iterator<UTF8> it = creates.iterator(); it.hasNext(); )
                internalReleaseCreate(it.next(), holder);
            creates.clear();
        }

        /**
         */
        public String toString() {
            return "[Lease.  Holder: " + holder.toString() + ", heldlocks: " +
                   locks.size() + ", pendingcreates: " + creates.size() + "]";
        }

        /**
         */
        public int compareTo(Lease o) {
            Lease l1 = this;
            Lease l2 = o;
            long lu1 = l1.lastUpdate;
            long lu2 = l2.lastUpdate;
            if (lu1 < lu2) {
                return -1;
            } else if (lu1 > lu2) {
                return 1;
            } else {
                return l1.holder.compareTo(l2.holder);
            }
        }
    }
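
    // Illustrative lease lifecycle (editor's sketch, not in the original
    // source), assuming LEASE_SOFTLIMIT_PERIOD < LEASE_HARDLIMIT_PERIOD as
    // defined in FSConstants:
    //
    //   Lease lease = new Lease(holder);  // lastUpdate = now()
    //   lease.startedCreate(src);         // src joins 'creates'
    //   lease.renew();                    // client checks in periodically
    //
    // If renewals stop, startFile() may reclaim the file once the soft limit
    // expires, and the LeaseMonitor below releases all locks and discards
    // the lease once the hard limit expires.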
    /******************************************************
     * LeaseMonitor checks for leases that have expired,
     * and disposes of them.
     ******************************************************/
    class LeaseMonitor implements Runnable {
        public void run() {
            while (fsRunning) {
                synchronized (FSNamesystem.this) {
                    synchronized (leases) {
                        Lease top;
                        while ((sortedLeases.size() > 0) &&
                               ((top = sortedLeases.first()) != null)) {
                            if (top.expiredHardLimit()) {
                                top.releaseLocks();
                                leases.remove(top.holder);
                                LOG.info("Removing lease " + top + ", leases remaining: " + sortedLeases.size());
                                if (!sortedLeases.remove(top)) {
                                    LOG.info("Unknown failure trying to remove " + top + " from lease set.");
                                }
                            } else {
                                break;
                            }
                        }
                    }
                }
                try {
                    Thread.sleep(2000);
                } catch (InterruptedException ie) {
                }
            }
        }
    }

    /**
     * Get a lock (perhaps exclusive) on the given file
     */
    /** @deprecated */ @Deprecated
    public synchronized int obtainLock( UTF8 src,
                                        UTF8 holder,
                                        boolean exclusive) throws IOException {
        if( isInSafeMode() )
          throw new SafeModeException( "Cannot lock file " + src, safeMode );
        int result = dir.obtainLock(src, holder, exclusive);
        if (result == COMPLETE_SUCCESS) {
            synchronized (leases) {
                Lease lease = leases.get(holder);
                if (lease == null) {
                    lease = new Lease(holder);
                    leases.put(holder, lease);
                    sortedLeases.add(lease);
                } else {
                    sortedLeases.remove(lease);
                    lease.renew();
                    sortedLeases.add(lease);
                }
                lease.obtained(src);
            }
        }
        return result;
    }

    /**
     * Release the lock on the given file
     */
    /** @deprecated */ @Deprecated
    public synchronized int releaseLock(UTF8 src, UTF8 holder) {
        int result = internalReleaseLock(src, holder);
        if (result == COMPLETE_SUCCESS) {
            synchronized (leases) {
                Lease lease = leases.get(holder);
                if (lease != null) {
                    lease.released(src);
                    if (! lease.hasLocks()) {
                        leases.remove(holder);
                        sortedLeases.remove(lease);
                    }
                }
            }
        }
        return result;
    }
    private int internalReleaseLock(UTF8 src, UTF8 holder) {
        return dir.releaseLock(src, holder);
    }

    /**
     * Release a pending file creation lock.
     * @param src The filename
     * @param holder The datanode that was creating the file
     */
    private void internalReleaseCreate(UTF8 src, UTF8 holder) {
      FileUnderConstruction v = pendingCreates.remove(src);
      if (v != null) {
         NameNode.stateChangeLog.debug(
                      "DIR* NameSystem.internalReleaseCreate: " + src
                    + " is removed from pendingCreates for "
                    + holder + " (failure)");
        for (Iterator<Block> it2 = v.getBlocks().iterator(); it2.hasNext(); ) {
          Block b = it2.next();
          pendingCreateBlocks.remove(b);
        }
      } else {
          NameNode.stateChangeLog.warn("DIR* NameSystem.internalReleaseCreate: "
                 + "attempt to release a create lock on "+ src.toString()
                 + " that was not in pedingCreates");
      }
    }

    /**
     * Renew the lease(s) held by the given client
     */
    public void renewLease(UTF8 holder) throws IOException {
        synchronized (leases) {
            if( isInSafeMode() )
              throw new SafeModeException( "Cannot renew lease for " + holder, safeMode );
            Lease lease = leases.get(holder);
            if (lease != null) {
                sortedLeases.remove(lease);
                lease.renew();
                sortedLeases.add(lease);
            }
        }
    }

    /**
     * Get a listing of all files at 'src'.  The DFSFileInfo[] array
     * exists so we can return file attributes (soon to be implemented)
     */
    public DFSFileInfo[] getListing(UTF8 src) {
        return dir.getListing(src);
    }

    /////////////////////////////////////////////////////////
    //
    // These methods are called by datanodes
    //
    /////////////////////////////////////////////////////////
    /**
     * Register Datanode.
     * <p>
     * The purpose of registration is to identify whether the new datanode
     * serves a new data storage, and will report new data block copies,
     * which the namenode was not aware of; or the datanode is a replacement
     * node for the data storage that was previously served by a different
     * or the same (in terms of host:port) datanode.
     * The data storages are distinguished by their storageIDs. When a new
     * data storage is reported the namenode issues a new unique storageID.
     * <p>
     * Finally, the namenode returns its namespaceID as the registrationID
     * for the datanodes.
     * namespaceID is a persistent attribute of the name space.
     * The registrationID is checked every time the datanode is communicating
     * with the namenode.
     * Datanodes with inappropriate registrationID are rejected.
     * If the namenode stops and then restarts, it can restore its
     * namespaceID and will continue serving the datanodes that have
     * previously registered with the namenode, without restarting the whole
     * cluster.
     *
     * @see DataNode#register()
     * @author Konstantin Shvachko
     */
    public synchronized void registerDatanode( DatanodeRegistration nodeReg,
                                               String networkLocation
                                              ) throws IOException {
      NameNode.stateChangeLog.info(
          "BLOCK* NameSystem.registerDatanode: "
          + "node registration from " + nodeReg.getName()
          + " storage " + nodeReg.getStorageID() );

      nodeReg.registrationID = getRegistrationID();
      DatanodeDescriptor nodeS = datanodeMap.get(nodeReg.getStorageID());
      DatanodeDescriptor nodeN = getDatanodeByName( nodeReg.getName() );
     
      if( nodeN != null && nodeN != nodeS ) {
          NameNode.LOG.info( "BLOCK* NameSystem.registerDatanode: "
                  + "node from name: " + nodeN.getName() );
        // nodeN previously served a different data storage,
        // which is not served by anybody anymore.
        removeDatanode( nodeN );
        // physically remove node from datanodeMap
        wipeDatanode( nodeN );
        // and log removal
        getEditLog().logRemoveDatanode( nodeN );
        nodeN = null;
      }

      if ( nodeS != null ) {
        if( nodeN == nodeS ) {
          // The same datanode has been just restarted to serve the same data
          // storage. We do not need to remove old data blocks, the delta will
          // be calculated on the next block report from the datanode
          NameNode.stateChangeLog.debug("BLOCK* NameSystem.registerDatanode: "
                                        + "node restarted." );
        } else {
          // nodeS is found
          // The registering datanode is a replacement node for the existing
          // data storage, which from now on will be served by a new node.
          NameNode.stateChangeLog.debug(
            "BLOCK* NameSystem.registerDatanode: "
            + "node " + nodeS.getName()
            + " is replaced by " + nodeReg.getName() + "." );
        }
        getEditLog().logRemoveDatanode( nodeS );
        // update cluster map
        clusterMap.remove( nodeS );
        nodeS.updateRegInfo( nodeReg );
        nodeS.setNetworkLocation( networkLocation );
        clusterMap.add( nodeS );
        getEditLog().logAddDatanode( nodeS );
       
        // also treat the registration message as a heartbeat
        synchronized( heartbeats ) {
            heartbeats.add( nodeS );
            //update its timestamp
            nodeS.updateHeartbeat( 0L, 0L, 0);
            nodeS.isAlive = true;
        }
        return;
      }

      // this is a new datanode serving a new data storage
      if( nodeReg.getStorageID().equals("") ) {
        // this data storage has never been registered
        // it is either empty or was created by pre-storageID version of DFS
        nodeReg.storageID = newStorageID();
        NameNode.stateChangeLog.debug(
            "BLOCK* NameSystem.registerDatanode: "
            + "new storageID " + nodeReg.getStorageID() + " assigned." );
      }
      // register new datanode
      DatanodeDescriptor nodeDescr
                  = new DatanodeDescriptor( nodeReg, networkLocation );
      unprotectedAddDatanode( nodeDescr );
      getEditLog().logAddDatanode( nodeDescr );
     
      // also treat the registration message as a heartbeat
      synchronized( heartbeats ) {
          heartbeats.add( nodeDescr );
          nodeDescr.isAlive = true;
          // no need to update its timestamp
          // because it is done when the descriptor is created
      }
      return;
    }
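    /* The registration logic above is a three-way case analysis on the two
     * lookups (nodeS by storageID, nodeN by host:port). A condensed sketch,
     * using hypothetical helper names for the paths taken above:
     *
     *   if (nodeN != null && nodeN != nodeS)
     *     retire(nodeN);           // host reused: old storage is orphaned
     *   if (nodeS != null)
     *     rebind(nodeS, nodeReg);  // known storage: restart or replacement
     *   else
     *     enroll(nodeReg);         // new storage: assign ID, add descriptor
     *
     * retire stands in for the removeDatanode/wipeDatanode/logRemoveDatanode
     * sequence, rebind for the updateRegInfo/setNetworkLocation branch, and
     * enroll for the newStorageID/unprotectedAddDatanode branch.
     */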
   
    /**
     * Get registrationID for datanodes based on the namespaceID.
     *
     * @see #registerDatanode(DatanodeRegistration, String)
     * @see FSImage#newNamespaceID()
     * @return registration ID
     */
    public String getRegistrationID() {
      return "NS" + Integer.toString( dir.namespaceID );
    }
   
    /**
     * Generate new storage ID.
     *
     * @return unique storage ID
     *
     * Note that collisions are still possible if a data storage
     * from a different cluster is brought in.
     */
    private String newStorageID() {
      String newID = null;
      while( newID == null ) {
        newID = "DS" + Integer.toString( r.nextInt() );
        if( datanodeMap.get( newID ) != null )
          newID = null;
      }
      return newID;
    }
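    /* newStorageID() is rejection sampling: draw random IDs until an unused
     * one turns up. The same pattern as a standalone sketch ('taken' is a
     * hypothetical stand-in for datanodeMap's key set):
     *
     *   Random rand = new Random();
     *   Set<String> taken = new HashSet<String>();
     *   String id;
     *   do {
     *     id = "DS" + rand.nextInt();
     *   } while (!taken.add(id));  // add() returns false on a collision
     *
     * Expected retries stay near zero while the number of registered
     * storages is tiny compared to 2^32, which is why the unbounded loop
     * is acceptable here.
     */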
   
    private boolean isDatanodeDead(DatanodeDescriptor node) {
      return (node.getLastUpdate() <
          (System.currentTimeMillis() - heartbeatExpireInterval));
    }
   
    /**
     * The given node has reported in.  This method should:
     * 1) Record the heartbeat, so the datanode isn't timed out
     * 2) Adjust usage stats for future block allocation
     *
     * If a substantial amount of time has passed since the last datanode
     * heartbeat, then request an immediate block report.
     *
     * @return true if block report is required or false otherwise.
     * @throws IOException
     */
    public boolean gotHeartbeat( DatanodeID nodeID,
                                 long capacity,
                                 long remaining,
                                 int xceiverCount
                                 ) throws IOException {
      synchronized (heartbeats) {
        synchronized (datanodeMap) {
          DatanodeDescriptor nodeinfo;
          try {
            nodeinfo = getDatanode( nodeID );
            if (nodeinfo == null ) {
                return true;
            }
          } catch(UnregisteredDatanodeException e) {
              return true;
          }
         
          if( !nodeinfo.isAlive ) {
              return true;
          } else {
              updateStats(nodeinfo, false);
              nodeinfo.updateHeartbeat(capacity, remaining, xceiverCount);
              updateStats(nodeinfo, true);
              return false;
          }
        }
      }
    }

    private void updateStats(DatanodeDescriptor node, boolean isAdded) {
      //
      // The statistics are protected by the heartbeat lock
      //
      assert(Thread.holdsLock(heartbeats));
      if (isAdded) {
        totalCapacity += node.getCapacity();
        totalRemaining += node.getRemaining();
        totalLoad += node.getXceiverCount();
      } else {
        totalCapacity -= node.getCapacity();
        totalRemaining -= node.getRemaining();
        totalLoad -= node.getXceiverCount();
      }
    }
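
    /* updateStats() is always used as a subtract/mutate/re-add sandwich
     * (see gotHeartbeat above): back the node's old values out of the
     * totals, update the node, add the new values back in.
     *
     *   synchronized (heartbeats) {
     *     updateStats(node, false);                  // totals -= old values
     *     node.updateHeartbeat(cap, rem, xceivers);  // mutate the node
     *     updateStats(node, true);                   // totals += new values
     *   }
     *
     * This keeps totalCapacity/totalRemaining/totalLoad equal to the sum
     * over all live nodes without ever rescanning the heartbeat list.
     */
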
    /**
     * Periodically calls heartbeatCheck().
     */
    class HeartbeatMonitor implements Runnable {
        /**
         */
        public void run() {
            while (fsRunning) {
                heartbeatCheck();
                try {
                    Thread.sleep(heartbeatRecheckInterval);
                } catch (InterruptedException ie) {
                }
            }
        }
    }
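    /* A self-contained model of the monitor loop above, with hypothetical
     * names (the thread wrapping HeartbeatMonitor is presumably started in
     * the namesystem constructor, outside this excerpt):
     *
     *   Thread t = new Thread(new Runnable() {
     *     public void run() {
     *       while (running) {
     *         check();                                // heartbeatCheck()
     *         try { Thread.sleep(intervalMillis); }
     *         catch (InterruptedException ie) { }     // ignore, re-check
     *       }
     *     }
     *   });
     *   t.setDaemon(true);  // do not keep the JVM alive for the monitor
     *   t.start();
     */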

    /**
     * remove a datanode descriptor
     * @param nodeID datanode ID
     * @author hairong
     */
    synchronized public void removeDatanode( DatanodeID nodeID )
    throws IOException {
      DatanodeDescriptor nodeInfo = getDatanode( nodeID );
      if (nodeInfo != null) {
        removeDatanode( nodeInfo );
      } else {
          NameNode.stateChangeLog.warn("BLOCK* NameSystem.removeDatanode: "
                  + nodeID.getName() + " does not exist");
      }
  }
 
  /**
   * remove a datanode descriptor
   * @param nodeInfo datanode descriptor
   * @author hairong
   */
    private void removeDatanode( DatanodeDescriptor nodeInfo ) {
      if (nodeInfo.isAlive) {
        updateStats(nodeInfo, false);
        heartbeats.remove(nodeInfo);
        nodeInfo.isAlive = false;
      }

      for (Iterator<Block> it = nodeInfo.getBlockIterator(); it.hasNext(); ) {
          removeStoredBlock(it.next(), nodeInfo);
      }
      unprotectedRemoveDatanode(nodeInfo);
      clusterMap.remove(nodeInfo);
    }

    void unprotectedRemoveDatanode( DatanodeDescriptor nodeDescr ) {
      // datanodeMap.remove(nodeDescr.getStorageID());
      // deaddatanodeMap.put(nodeDescr.getName(), nodeDescr);
      nodeDescr.resetBlocks();
      NameNode.stateChangeLog.debug(
          "BLOCK* NameSystem.unprotectedRemoveDatanode: "
          + nodeDescr.getName() + " is out of service now.");
    }
   
    void unprotectedAddDatanode( DatanodeDescriptor nodeDescr ) {
      datanodeMap.put( nodeDescr.getStorageID(), nodeDescr );
      clusterMap.add(nodeDescr);
      NameNode.stateChangeLog.debug(
          "BLOCK* NameSystem.unprotectedAddDatanode: "
          + "node " + nodeDescr.getName() + " is added to datanodeMap." );
    }

   
    /**
     * Physically remove node from datanodeMap.
     *
     * @param nodeID node
     */
    void wipeDatanode( DatanodeID nodeID ) {
      String key = nodeID.getStorageID();
      datanodeMap.remove(key);
      NameNode.stateChangeLog.debug(
          "BLOCK* NameSystem.wipeDatanode: "
          + nodeID.getName() + " storage " + nodeID.getStorageID()
          + " is removed from datanodeMap.");
    }
   
    private FSEditLog getEditLog() {
      return dir.fsImage.getEditLog();
    }

    /**
     * Check if there are any expired heartbeats, and if so,
     * whether any blocks have to be re-replicated.
     * While removing dead datanodes, make sure that only one datanode is marked
     * dead at a time within the synchronized section. Otherwise, a cascading
     * effect causes more datanodes to be declared dead.
     */
    void heartbeatCheck() {
      boolean allAlive = false;
      while (!allAlive) {
        boolean foundDead = false;
        DatanodeID nodeID = null;

        // locate the first dead node.
        synchronized(heartbeats) {
            for (Iterator<DatanodeDescriptor> it = heartbeats.iterator();
            it.hasNext();) {
              DatanodeDescriptor nodeInfo = it.next();
              if (isDatanodeDead(nodeInfo)) {
                foundDead = true;
                nodeID = nodeInfo;
                break;
              }
            }
        }

        // acquire the fsnamesystem lock, and then remove the dead node.
        if (foundDead) {
          synchronized (this) {
            synchronized(heartbeats) {
              synchronized (datanodeMap) {
                DatanodeDescriptor nodeInfo = null;
                try {
                  nodeInfo = getDatanode(nodeID);
                } catch (IOException e) {
                  nodeInfo = null;
                }
                if (nodeInfo != null && isDatanodeDead(nodeInfo)) {
                  NameNode.stateChangeLog.info("BLOCK* NameSystem.heartbeatCheck: "
                    + "lost heartbeat from " + nodeInfo.getName());
                  removeDatanode(nodeInfo);
                }
              }
            }
          }
        }
        allAlive = ! foundDead;
      }
    }
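    /* Two details of heartbeatCheck() are worth making explicit. First, the
     * lock order for removal is FSNamesystem (this) -> heartbeats ->
     * datanodeMap. Second, the candidate found during the cheap scan is
     * re-verified after the heavier locks are acquired, since it may have
     * re-registered or been removed in between. Condensed, with hypothetical
     * helper names for the inlined logic above:
     *
     *   DatanodeID dead = scanForDead();      // holds only 'heartbeats'
     *   if (dead != null) {
     *     synchronized (this) {
     *       synchronized (heartbeats) {
     *         synchronized (datanodeMap) {
     *           if (stillDead(dead))          // re-check under full locks
     *             removeDatanode(dead);
     *         }
     *       }
     *     }
     *   }
     */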
   
    /**
     * The given node is reporting all its blocks.  Use this info to
     * update the (machine-->blocklist) and (block-->machinelist) tables.
     */
    public synchronized Block[] processReport(DatanodeID nodeID,
                                              Block newReport[]
                                            ) throws IOException {
        NameNode.stateChangeLog.debug("BLOCK* NameSystem.processReport: "
          +"from "+nodeID.getName()+" "+newReport.length+" blocks" );
        DatanodeDescriptor node = getDatanode( nodeID );

        //
        // Modify the (block-->datanode) map, according to the difference
        // between the old and new block report.
        //
        int newPos = 0;
        boolean modified = false;
        Iterator<Block> iter = node.getBlockIterator();
        Block oldblk = iter.hasNext() ? iter.next() : null;
        Block newblk = (newReport != null && newReport.length > 0) ?
                        newReport[0] : null;

        while (oldblk != null || newblk != null) {
          
            int cmp = (oldblk == null) ? 1 :
                       ((newblk == null) ? -1 : oldblk.compareTo(newblk));

            if (cmp == 0) {
                // Do nothing, blocks are the same
                newPos++;
                oldblk = iter.hasNext() ? iter.next() : null;
                newblk = (newPos < newReport.length)
                         ? newReport[newPos] : null;
            } else if (cmp < 0) {
                // The old report has a block the new one does not
                removeStoredBlock(oldblk, node);
                modified = true;
                oldblk = iter.hasNext() ? iter.next() : null;
            } else {
                // The new report has a block the old one does not
                addStoredBlock(newblk, node);
                modified = true;
                newPos++;
                newblk = (newPos < newReport.length)
                         ? newReport[newPos] : null;
            }
        }
        //
        // Modify node so it has the new blockreport
        //
        if (modified) {
            node.updateBlocks(newReport);
        }

        //
        // We've now completely updated the node's block report profile.
        // We now go through all its blocks and find which ones are invalid,
        // no longer pending, or over-replicated.
        //
        // (Note it's not enough to just invalidate blocks at lease expiry
        // time; datanodes can go down before the client's lease on
        // the failed file expires and miss the "expire" event.)
        //
        // This function considers every block on a datanode, and thus
        // should only be invoked infrequently.
        //
        Collection<Block> obsolete = new ArrayList<Block>();
        for (Iterator<Block> it = node.getBlockIterator(); it.hasNext(); ) {
            Block b = it.next();

            if (! dir.isValidBlock(b) && ! pendingCreateBlocks.contains(b)) {
                obsolete.add(b);
                NameNode.stateChangeLog.debug("BLOCK* NameSystem.processReport: "
                        +"ask "+nodeID.getName()+" to delete "+b.getBlockName() );
            }
        }
        return obsolete.toArray(new Block[obsolete.size()]);
    }
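    /* The report diff above is a textbook sorted-merge: the node's current
     * block list and the incoming report are walked in ascending block
     * order (both are assumed sorted, as the compareTo-driven merge
     * requires) and only differences trigger work. The same shape over
     * plain long IDs, for illustration:
     *
     *   int i = 0, j = 0;
     *   while (i < old.length || j < rep.length) {
     *     int cmp = (i == old.length) ? 1
     *             : (j == rep.length) ? -1
     *             : (old[i] < rep[j] ? -1 : (old[i] == rep[j] ? 0 : 1));
     *     if (cmp == 0)     { i++; j++; }           // present in both
     *     else if (cmp < 0) { removed(old[i++]); }  // vanished from node
     *     else              { added(rep[j++]); }    // new on node
     *   }
     *
     * This costs O(m + n) comparisons instead of the O(m * n) a naive
     * containment check would cost on every full report.
     */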

    /**
     * Modify (block-->datanode) map.  Remove block from set of
     * needed replications if this takes care of the problem.
     */
    synchronized void addStoredBlock(Block block, DatanodeDescriptor node) {
      SortedSet<DatanodeDescriptor> containingNodes = blocksMap.get(block);
        if (containingNodes == null) {
            containingNodes = new TreeSet<DatanodeDescriptor>();
            blocksMap.put(block, containingNodes);
        }
        int curReplicaDelta = 0;
        if (! containingNodes.contains(node)) {
            containingNodes.add(node);
            curReplicaDelta = 1;
            //
            // Hairong: I would prefer to set the level of next logrecord
            // to be debug.
            // But at startup time, because too many new blocks come in
            // they simply take up all the space in the log file
            // So I set the level to be trace
            //
            NameNode.stateChangeLog.trace("BLOCK* NameSystem.addStoredBlock: "
                    +"blockMap updated: "+node.getName()+" is added to "+block.getBlockName() );
        } else {
            NameNode.stateChangeLog.warn("BLOCK* NameSystem.addStoredBlock: "
                    + "Redundant addStoredBlock request received for "
                    + block.getBlockName() + " on " + node.getName());
        }

        FSDirectory.INode fileINode = dir.getFileByBlock(block);
        if( fileINode == null ) // block does not belong to any file
            return;
       
        // filter out containingNodes that are marked for decommission.
        int numCurrentReplica = countContainingNodes(containingNodes);
       
        // check whether safe replication is reached for the block
        // only if it is a part of a file
        incrementSafeBlockCount( numCurrentReplica );
       
        // handle underReplication/overReplication
        short fileReplication = fileINode.getReplication();
        neededReplications.update(block, curReplicaDelta, 0);
        if (numCurrentReplica >= fileReplication ) {
            pendingReplications.remove(block);
        }       
        proccessOverReplicatedBlock( block, fileReplication );
    }
   
    /**
     * Find how many of the containing nodes are "extra", if any.
     * If there are any extras, call chooseExcessReplicates() to
     * mark them in the excessReplicateMap.
     */
    private void proccessOverReplicatedBlock( Block block, short replication ) {
      SortedSet<DatanodeDescriptor> containingNodes = blocksMap.get(block);
      if( containingNodes == null )
        return;
      Collection<DatanodeDescriptor> nonExcess = new ArrayList<DatanodeDescriptor>();
      for (Iterator<DatanodeDescriptor> it = containingNodes.iterator(); it.hasNext(); ) {
          DatanodeDescriptor cur = it.next();
          Collection<Block> excessBlocks = excessReplicateMap.get(cur.getStorageID());
          if (excessBlocks == null || ! excessBlocks.contains(block)) {
            if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) {
              nonExcess.add(cur);
            }
          }
      }
      chooseExcessReplicates(nonExcess, block, replication);   
    }

    /**
     * We want "replication" replicates for the block, but we now have too many. 
     * In this method, copy enough nodes from 'srcNodes' into 'dstNodes' such that:
     *
     * srcNodes.size() - dstNodes.size() == replication
     *
     * We pick node with least free space
     * In the future, we might enforce some kind of policy
     * (like making sure replicates are spread across racks).
     */
    void chooseExcessReplicates(Collection<DatanodeDescriptor> nonExcess,
                                Block b, short replication) {
        while (nonExcess.size() - replication > 0) {
            DatanodeInfo cur = null;
            long minSpace = Long.MAX_VALUE;
           
            for (Iterator<DatanodeDescriptor> iter = nonExcess.iterator(); iter.hasNext();) {
                DatanodeInfo node = iter.next();
                long free = node.getRemaining();
               
                if(minSpace > free) {
                    minSpace = free;
                    cur = node;
                }
            }
           
            nonExcess.remove(cur);

            Collection<Block> excessBlocks = excessReplicateMap.get(cur.getStorageID());
            if (excessBlocks == null) {
                excessBlocks = new TreeSet<Block>();
                excessReplicateMap.put(cur.getStorageID(), excessBlocks);
            }
            excessBlocks.add(b);
            NameNode.stateChangeLog.debug("BLOCK* NameSystem.chooseExcessReplicates: "
                    +"("+cur.getName()+", "+b.getBlockName()+") is added to excessReplicateMap" );

            //
            // The 'excessblocks' tracks blocks until we get confirmation
            // that the datanode has deleted them; the only way we remove them
            // is when we get a "removeBlock" message. 
            //
            // The 'invalidate' list is used to inform the datanode the block
            // should be deleted.  Items are removed from the invalidate list
            // upon giving instructions to the datanode.
            //
            Collection<Block> invalidateSet = recentInvalidateSets.get(cur.getStorageID());
            if (invalidateSet == null) {
                invalidateSet = new ArrayList<Block>();
                recentInvalidateSets.put(cur.getStorageID(), invalidateSet);
            }
            invalidateSet.add(b);
            NameNode.stateChangeLog.debug("BLOCK* NameSystem.chooseExcessReplicates: "
                    +"("+cur.getName()+", "+b.getBlockName()+") is added to recentInvalidateSets" );
        }
    }
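    /* chooseExcessReplicates() repeatedly takes a linear minimum over the
     * shrinking candidate list, which is O(k^2) for k candidates; fine for
     * the small k seen here. The same least-free-space-first selection
     * could be written with a heap (a sketch, not the code used above;
     * markExcess stands in for the excessReplicateMap/recentInvalidateSets
     * bookkeeping):
     *
     *   PriorityQueue<DatanodeDescriptor> q =
     *     new PriorityQueue<DatanodeDescriptor>(nonExcess.size() + 1,
     *       new Comparator<DatanodeDescriptor>() {
     *         public int compare(DatanodeDescriptor a, DatanodeDescriptor b) {
     *           long d = a.getRemaining() - b.getRemaining();
     *           return d < 0 ? -1 : (d > 0 ? 1 : 0);
     *         }
     *       });
     *   q.addAll(nonExcess);
     *   while (q.size() > replication)
     *     markExcess(q.poll());  // evict the least-free node first
     */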

    /**
     * Modify (block-->datanode) map.  Possibly generate
     * replication tasks, if the removed block is still valid.
     */
    synchronized void removeStoredBlock(Block block, DatanodeDescriptor node) {
        NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: "
                +block.getBlockName() + " from "+node.getName() );
        SortedSet<DatanodeDescriptor> containingNodes = blocksMap.get(block);
        if (containingNodes == null || ! containingNodes.contains(node)) {
          NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: "
            +block.getBlockName()+" has already been removed from node "+node );
          return;
        }
        containingNodes.remove(node);
       
        // filter out containingNodes that are marked for decommission.
        int numCurrentReplica = countContainingNodes(containingNodes);

        decrementSafeBlockCount( numCurrentReplica );
        if( containingNodes.isEmpty() )
          blocksMap.remove(block);
        //
        // It's possible that the block was removed because of a datanode
        // failure.  If the block is still valid, check if replication is
        // necessary.  In that case, put block on a possibly-will-
        // be-replicated list.
        //
        FSDirectory.INode fileINode = dir.getFileByBlock(block);
        if( fileINode != null ) {
            neededReplications.update(block, -1, 0);
        }

        //
        // We've removed a block from a node, so it's definitely no longer
        // in "excess" there.
        //
        Collection<Block> excessBlocks = excessReplicateMap.get(node.getStorageID());
        if (excessBlocks != null) {
            excessBlocks.remove(block);
            NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: "
                    +block.getBlockName()+" is removed from excessBlocks" );
            if (excessBlocks.size() == 0) {
                excessReplicateMap.remove(node.getStorageID());
            }
        }
    }

    /**
     * The given node is reporting that it received a certain block.
     */
    public synchronized void blockReceived( DatanodeID nodeID, 
                                            Block block
                                          ) throws IOException {
        DatanodeDescriptor node = getDatanode( nodeID );
        if (node == null) {
            NameNode.stateChangeLog.warn("BLOCK* NameSystem.blockReceived: "
             + block.getBlockName() + " is received from an unrecorded node "
             + nodeID.getName() );
            throw new IllegalArgumentException(
                "Unexpected exception.  Got blockReceived message from node "
                + block.getBlockName() + ", but there is no info for it");
        }
        NameNode.stateChangeLog.debug("BLOCK* NameSystem.blockReceived: "
                +block.getBlockName()+" is received from " + nodeID.getName() );
        //
        // Modify the blocks->datanode map
        //
        addStoredBlock(block, node);

        //
        // Supplement node's blockreport
        //
        node.addBlock(block);
    }

    /**
     * Total raw bytes.
     */
    public long totalCapacity() {

      synchronized (heartbeats) {
        return totalCapacity;
      }
    }

    /**
     * Total non-used raw bytes.
     */
    public long totalRemaining() {
      synchronized (heartbeats) {
        return totalRemaining;
      }
    }

    /**
     * Total number of connections.
     */
    public int totalLoad() {
      synchronized (heartbeats) {
        return totalLoad;
      }
    }

    public synchronized DatanodeInfo[] datanodeReport() {
      DatanodeInfo results[] = null;
        synchronized (datanodeMap) {
            results = new DatanodeInfo[datanodeMap.size()];
            int i = 0;
            for(Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator(); it.hasNext(); )
              results[i++] = new DatanodeInfo( it.next() );
        }
        return results;
    }
   
    /**
     */
    public synchronized void DFSNodesStatus( ArrayList<DatanodeDescriptor> live,
                                             ArrayList<DatanodeDescriptor> dead ) {
      synchronized (datanodeMap) {
          for(Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator(); it.hasNext(); ) {
            DatanodeDescriptor node = it.next();
            if( isDatanodeDead(node))
              dead.add( node );
            else
              live.add( node );
          }
      }
    }

    /**
     * Start decommissioning the specified datanodes. If a datanode is
     * already being decommissioned, then this is a no-op.
     */
    public synchronized void startDecommission (String[] nodes)
                             throws IOException {
      if (isInSafeMode()) {
        throw new SafeModeException("Cannot decommission node ", safeMode);
      }
      boolean isError = false;
      String badnodes = "";

      synchronized (datanodeMap) {
        for (int i = 0; i < nodes.length; i++) {
          boolean found = false;
          for (Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator();
               it.hasNext(); ) {
            DatanodeDescriptor node = it.next();

            //
            // If this is a node that we are interested in, set its admin state.
            //
            if (node.getName().equals(nodes[i]) ||
                node.getHost().equals(nodes[i])) {
              found = true;
              if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
                LOG.info("Start Decommissioning node " + node.name);
                node.startDecommission();
                //
                // all blocks that reside on this node have to be
                // replicated.
                Block decommissionBlocks[] = node.getBlocks();
                for (int j = 0; j < decommissionBlocks.length; j++) {
                    neededReplications.update(decommissionBlocks[j], -1, 0);
                }
              }
              break;
            }
          }
          //
          // Record the fact that a specified node was not found
          //
          if (!found) {
            badnodes += nodes[i] + " ";
            isError = true;
          }
        }
      }
      if (isError) {
        throw new IOException("Nodes " + badnodes + " not found");
      }
    }

    /**
     * Stop decommissioning the specified datanodes.
     */
    public synchronized void stopDecommission (String[] nodes)
                             throws IOException {
      if (isInSafeMode()) {
        throw new SafeModeException("Cannot decommission node ", safeMode);
      }
      boolean isError = false;
      String badnodes = "";

      synchronized (datanodeMap) {
        for (int i = 0; i < nodes.length; i++) {
          boolean found = false;
          for (Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator();
               it.hasNext(); ) {
            DatanodeDescriptor node = it.next();

            //
            // If this is a node that we are interested in, set its admin state.
            //
            if (node.getName().equals(nodes[i]) ||
                node.getHost().equals(nodes[i])) {
              LOG.info("Stop Decommissioning node " + node.name);
              found = true;
              node.stopDecommission();
              break;
            }
          }
          //
          // Record the fact that a specified node was not found
          //
          if (!found) {
            badnodes += nodes[i] + " ";
            isError = true;
          }
        }
      }
      if (isError) {
        throw new IOException("Nodes " + badnodes + " not found");
      }
    }

    /**
     * Return true if all specified nodes are decommissioned.
     * Otherwise return false.
     */
    public synchronized boolean checkDecommissioned (String[] nodes)
                                   throws IOException {
      String badnodes = "";
      boolean isError = false;

      synchronized (datanodeMap) {
        for (int i = 0; i < nodes.length; i++) {
          boolean found = false;
          for (Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator();
               it.hasNext(); ) {
            DatanodeDescriptor node = it.next();

            //
            // If this is a node that we are interested in, check its admin state.
            //
            if (node.getName().equals(nodes[i]) ||
                node.getHost().equals(nodes[i])) {
              found = true;
              boolean isDecommissioned = checkDecommissionStateInternal(node);
              if (!isDecommissioned) {
                return false;
              }
            }
          }
          if (!found) {
            badnodes += nodes[i] + " ";
            isError = true;
          }
        }
      }
      if (isError) {
        throw new IOException("Nodes " + badnodes + " not found");
      }
      return true;
    }

    /**
     */
    public DatanodeInfo getDataNodeInfo(String name) {
        return datanodeMap.get(name);
    }
    /**
     */
    public String getDFSNameNodeMachine() {
        return localMachine;
    }
    /**
     */
    public int getDFSNameNodePort() {
        return port;
    }
    /**
     */
    public Date getStartTime() {
        return startTime;
    }
    /////////////////////////////////////////////////////////
    //
    // These methods are called by the Namenode system, to see
    // if there is any work for a given datanode.
    //
    /////////////////////////////////////////////////////////

    /**
     * Check if there are any recently-deleted blocks a datanode should remove.
     */
    public synchronized Block[] blocksToInvalidate( DatanodeID nodeID ) {
        // Ask datanodes to perform block delete 
        // only if safe mode is off.
        if( isInSafeMode() )
          return null;
      
        Collection<Block> invalidateSet = recentInvalidateSets.remove(
                                                      nodeID.getStorageID() );
        if (invalidateSet == null) {
            return null;
        }

        Iterator<Block> it = null;
        int sendNum = invalidateSet.size();
        int origSize = sendNum;
        ArrayList<Block> sendBlock = new ArrayList<Block>(sendNum);

        //
        // calculate the number of blocks that we send in one message
        //
        if (sendNum > FSConstants.BLOCK_INVALIDATE_CHUNK) {
            sendNum =  FSConstants.BLOCK_INVALIDATE_CHUNK;
        }
        //
        // Copy the first chunk into sendBlock
        //
        for (it = invalidateSet.iterator(); sendNum > 0; sendNum--) {
            assert(it.hasNext());
            sendBlock.add(it.next());
            it.remove();
        }

        //
        // If we could not send everything in this message, reinsert the
        // remaining blocks so they are sent in a later message.
        //
        if (it.hasNext()) {
            assert(origSize > FSConstants.BLOCK_INVALIDATE_CHUNK);
            recentInvalidateSets.put(nodeID.getStorageID(), invalidateSet);
        }
       
        if (NameNode.stateChangeLog.isDebugEnabled()) {
            StringBuffer blockList = new StringBuffer();
            for (int i = 0; i < sendBlock.size(); i++) {
                blockList.append(' ');
                Block block = sendBlock.get(i);
                blockList.append(block.getBlockName());
            }
            NameNode.stateChangeLog.debug("BLOCK* NameSystem.blocksToInvalidate: "
                   +"ask "+nodeID.getName()+" to delete " + blockList );
        }
        return sendBlock.toArray(new Block[sendBlock.size()]);
    }
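    /* blocksToInvalidate() caps each reply at BLOCK_INVALIDATE_CHUNK blocks
     * and re-queues whatever is left, so a node with a large deletion
     * backlog is drained in batches over successive calls. The pattern,
     * reduced to a sketch over a generic pending set (CHUNK and requeue
     * are hypothetical names):
     *
     *   List<Block> batch = new ArrayList<Block>(CHUNK);
     *   Iterator<Block> it = pending.iterator();
     *   while (batch.size() < CHUNK && it.hasNext()) {
     *     batch.add(it.next());
     *     it.remove();
     *   }
     *   if (!pending.isEmpty())
     *     requeue(pending);  // leftovers wait for the next call
     */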

    /*
     * Counts the number of nodes in the given list. Skips over nodes
     * that are marked for decommission.
     */
    private int countContainingNodes(Collection<DatanodeDescriptor> nodelist) {
      int count = 0;
      for (Iterator<DatanodeDescriptor> it = nodelist.iterator();
           it.hasNext(); ) {
        DatanodeDescriptor node = it.next();
        if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
          count++;
        }
      }
      return count;
    }

    /*
     * Filter out nodes that are marked for decommission in the given list.
     * Return a list of non-decommissioned nodes.
     */
    private List<DatanodeDescriptor> filterDecommissionedNodes(
        Collection<DatanodeDescriptor> nodelist) {
      List<DatanodeDescriptor> nonCommissionedNodeList =
        new ArrayList<DatanodeDescriptor>();
      for (Iterator<DatanodeDescriptor> it = nodelist.iterator();
           it.hasNext(); ) {
        DatanodeDescriptor node = it.next();
        if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
          nonCommissionedNodeList.add(node);
        }
      }
      return nonCommissionedNodeList;
    }
    /*
     * Return true if there are any blocks in neededReplication that
     * reside on the specified node. Otherwise returns false.
     */
    private boolean isReplicationInProgress(DatanodeDescriptor srcNode) {
        for (Iterator<Block> it = neededReplications.iterator(); it.hasNext();){
            Block block = it.next();
            Collection<DatanodeDescriptor> containingNodes = blocksMap.get(block);
            if (containingNodes != null && containingNodes.contains(srcNode)) {
                return true;
            }
        }
      return false;
    }

    /**
     * Change, if appropriate, the admin state of a datanode to
     * decommission completed. Return true if decommission is complete.
     */
    private boolean checkDecommissionStateInternal(DatanodeDescriptor node) {
      //
      // Check to see if there are any blocks in the neededReplication
      // data structure that have a replica on the node being decommissioned.
      //
      if (node.isDecommissionInProgress()) {
        if (!isReplicationInProgress(node)) {
          node.setDecommissioned();
          LOG.info("Decommission complete for node " + node.name);
        }
      }
      if (node.isDecommissioned()) {
        return true;
      }
      return false;
    }

    /**
     * Change, if appropriate, the admin state of a datanode to
     * decommission completed.
     */
    public synchronized void checkDecommissionState(DatanodeID nodeReg) {
      DatanodeDescriptor node = datanodeMap.get(nodeReg.getStorageID());
      if (node == null) {
        return;
      }
      checkDecommissionStateInternal(node);
    }

    /**
     * Return with a list of Block/DataNodeInfo sets, indicating
     * where various Blocks should be copied, ASAP.
     *
     * The Array that we return consists of two objects:
     * The 1st elt is an array of Blocks.
     * The 2nd elt is a 2D array of DatanodeDescriptor objs, identifying the
     *     target sequence for the Block at the appropriate index.
     *
     */
    public synchronized Object[] pendingTransfers(DatanodeID srcNode,
                                                  int xmitsInProgress) {
    // Ask datanodes to perform block replication 
    // only if safe mode is off.
    if( isInSafeMode() )
      return null;
   
    synchronized (neededReplications) {
      Object results[] = null;
      int scheduledXfers = 0;

      if (neededReplications.size() > 0) {
        //
        // Go through all blocks that need replications. See if any
        // are present at the current node. If so, ask the node to
        // replicate them.
        //
        List<Block> replicateBlocks = new ArrayList<Block>();
        List<Integer> numCurrentReplicas = new ArrayList<Integer>();
        List<DatanodeDescriptor[]> replicateTargetSets;
        replicateTargetSets = new ArrayList<DatanodeDescriptor[]>();
        for (Iterator<Block> it = neededReplications.iterator(); it.hasNext();) {
          //
          // We can schedule at most (maxReplicationStreams - xmitsInProgress) transfers
          //
          if (scheduledXfers >= this.maxReplicationStreams - xmitsInProgress) {
            break;
          }

          Block block = it.next();
          long blockSize = block.getNumBytes();
          FSDirectory.INode fileINode = dir.getFileByBlock(block);
          if (fileINode == null) { // block does not belong to any file
            it.remove();
          } else {
            Collection<DatanodeDescriptor> containingNodes = blocksMap.get(block);
            Collection<Block> excessBlocks = excessReplicateMap.get(
                                                      srcNode.getStorageID() );

            // srcNode must contain the block, and the block must
            // not be scheduled for removal on that node
            if (containingNodes != null && containingNodes.contains(srcNode)
                && (excessBlocks == null || ! excessBlocks.contains(block))) {
              // filter out containingNodes that are marked for decommission.
              List<DatanodeDescriptor> nodes =
                filterDecommissionedNodes(containingNodes);
              int numCurrentReplica = nodes.size();
              DatanodeDescriptor targets[] = replicator.chooseTarget(
                  Math.min( fileINode.getReplication() - numCurrentReplica,
                            this.maxReplicationStreams - xmitsInProgress),
                  datanodeMap.get(srcNode.getStorageID()),
                  nodes, null, blockSize);
              if (targets.length > 0) {
                // Build items to return
                replicateBlocks.add(block);
                numCurrentReplicas.add(Integer.valueOf(numCurrentReplica));
                replicateTargetSets.add(targets);
                scheduledXfers += targets.length;
              }
            }
          }
        }

        //
        // Move the block-replication into a "pending" state.
        // The reason we use 'pending' is so we can retry
        // replications that fail after an appropriate amount of time.
        // (REMIND - mjc - this timer is not yet implemented.)
        //
        if (replicateBlocks.size() > 0) {
          int i = 0;
          for (Iterator<Block> it = replicateBlocks.iterator(); it.hasNext(); i++) {
            Block block = it.next();
            DatanodeDescriptor targets[] =
                      (DatanodeDescriptor[]) replicateTargetSets.get(i);
            int numCurrentReplica = numCurrentReplicas.get(i).intValue();
            int numExpectedReplica = dir.getFileByBlock( block).getReplication();
            neededReplications.update(
                    block, numCurrentReplica, numExpectedReplica);
            if (numCurrentReplica + targets.length >= numExpectedReplica) {
              pendingReplications.add(block);
              NameNode.stateChangeLog.debug(
                "BLOCK* NameSystem.pendingTransfer: "
                + block.getBlockName()
                + " is removed from neededReplications to pendingReplications");
            }

            if (NameNode.stateChangeLog.isInfoEnabled()) {
              StringBuffer targetList = new StringBuffer("datanode(s)");
              for (int k = 0; k < targets.length; k++) {
                targetList.append(' ');
                targetList.append(targets[k].getName());
              }
              NameNode.stateChangeLog.info(
                      "BLOCK* NameSystem.pendingTransfer: " + "ask "
                      + srcNode.getName() + " to replicate "
                      + block.getBlockName() + " to " + targetList);
              NameNode.stateChangeLog.debug(
                  "BLOCK* neededReplications = " + neededReplications.size()
                  + " pendingReplications = " + pendingReplications.size() );
            }
          }

          //
          // Build returned objects from above lists
          //
          DatanodeDescriptor targetMatrix[][] =
                        new DatanodeDescriptor[replicateTargetSets.size()][];
          for (i = 0; i < targetMatrix.length; i++) {
            targetMatrix[i] = replicateTargetSets.get(i);
          }

          results = new Object[2];
          results[0] = replicateBlocks.toArray(new Block[replicateBlocks.size()]);
          results[1] = targetMatrix;
        }
      }
      return results;
    }
  }
 
    /** The class is responsible for choosing the desired number of targets
     * for placing block replicas.
     * The replica placement strategy is: if the writer is on a datanode,
     * the 1st replica is placed on the local machine,
     * otherwise on a random datanode. The 2nd replica is placed on a
     * datanode on a different rack. The 3rd replica is placed on a
     * datanode on the same rack as the first replica.
     * @author hairong
     *
     */
    class Replicator {
      private class NotEnoughReplicasException extends Exception {
        NotEnoughReplicasException( String msg ) {
          super( msg );
        }
      }
     
      /**
       * Choose <i>numOfReplicas</i> data nodes for <i>writer</i> to replicate
       * a block with size <i>blocksize</i>.
       * If not enough targets are available, return as many as possible.
       *
       * @param numOfReplicas number of replicas wanted.
       * @param writer the writer's machine, null if not in the cluster.
       * @param excludedNodes datanodes that should not be considered as targets.
       * @param blocksize size of the data to be written.
       * @return array of DatanodeDescriptor instances chosen as targets
       * and sorted as a pipeline.
       */
      DatanodeDescriptor[] chooseTarget(int numOfReplicas,
          DatanodeDescriptor writer,
          List<DatanodeDescriptor> excludedNodes,
          long blocksize ) {
        if( excludedNodes == null) {
          excludedNodes = new ArrayList<DatanodeDescriptor>();
        }
       
        return chooseTarget(numOfReplicas, writer,
            new ArrayList<DatanodeDescriptor>(), excludedNodes, blocksize);
      }
     
      /**
       * Choose <i>numOfReplicas</i> additional data nodes for <i>writer</i>
       * to re-replicate a block with size <i>blocksize</i>.
       * If not enough targets are available, return as many as possible.
       *
       * @param numOfReplicas additional number of replicas wanted.
       * @param writer the writer's machine, null if not in the cluster.
       * @param chosenNodes datanodes that have already been chosen as targets.
       * @param excludedNodes datanodes that should not be considered as targets.
       * @param blocksize size of the data to be written.
       * @return array of DatanodeDescriptor instances chosen as targets
       * and sorted as a pipeline.
       */
      DatanodeDescriptor[] chooseTarget(int numOfReplicas,
          DatanodeDescriptor writer,
          List<DatanodeDescriptor> chosenNodes,
          List<DatanodeDescriptor> excludedNodes,
          long blocksize ) {
        if( numOfReplicas == 0 )
          return new DatanodeDescriptor[0];
       
        if( excludedNodes == null) {
          excludedNodes = new ArrayList<DatanodeDescriptor>();
        }
       
        int clusterSize = clusterMap.getNumOfLeaves();
        int totalNumOfReplicas = chosenNodes.size()+numOfReplicas;
        if( totalNumOfReplicas > clusterSize) {
          numOfReplicas -= (totalNumOfReplicas-clusterSize);
          totalNumOfReplicas = clusterSize;
        }
       
        int maxNodesPerRack =
          (totalNumOfReplicas-1)/clusterMap.getNumOfRacks()+2;
       
        List<DatanodeDescriptor> results =
          new ArrayList<DatanodeDescriptor>(chosenNodes);
        excludedNodes.addAll(chosenNodes);
       
        if(!clusterMap.contains(writer))
          writer=null;
       
        DatanodeDescriptor localNode = chooseTarget(numOfReplicas, writer,
            clusterMap.getLeaves(NodeBase.ROOT),
            excludedNodes, blocksize, maxNodesPerRack, results );
       
        results.removeAll(chosenNodes);
       
        // sorting nodes to form a pipeline
        return getPipeline((writer==null)?localNode:writer, results);
      }
     
      /* choose <i>numOfReplicas</i> from <i>clusterNodes</i> */
      private DatanodeDescriptor chooseTarget(int numOfReplicas,
          DatanodeDescriptor writer,
          DatanodeDescriptor[] clusterNodes,
          List<DatanodeDescriptor> excludedNodes,
          long blocksize,
          int maxNodesPerRack,
          List<DatanodeDescriptor> results) {
       
        if( numOfReplicas == 0 ) return writer;
       
        int numOfResults = results.size();
        if(writer == null && (numOfResults==1 || numOfResults==2) ) {
          writer = results.get(0);
        }
       
        try {
          switch( numOfResults ) {
          case 0:
            writer = chooseLocalNode(writer, clusterNodes, excludedNodes,
                blocksize, maxNodesPerRack, results);
            if(--numOfReplicas == 0) break;
          case 1:
            chooseRemoteRack(1, writer, clusterNodes, excludedNodes,
                blocksize, maxNodesPerRack, results);
            if(--numOfReplicas == 0) break;
          case 2:
            if(clusterMap.isOnSameRack(results.get(0), results.get(1))) {
              chooseRemoteRack(1, writer, clusterNodes, excludedNodes,
                  blocksize, maxNodesPerRack, results);
            } else {
              chooseLocalRack(writer, clusterNodes, excludedNodes,
                  blocksize, maxNodesPerRack, results);
            }
            if(--numOfReplicas == 0) break;
          default:
            chooseRandom(numOfReplicas, clusterNodes, excludedNodes,
                blocksize, maxNodesPerRack, results);
          }
        } catch (NotEnoughReplicasException e) {
          LOG.warn("Not be able to place enough replicas, still in need of "
              + numOfReplicas );
        }
        return writer;
      }
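      /* The switch above deliberately falls through: each case places
       * exactly the next replica of the default policy, starting from
       * however many targets already exist, and anything beyond the third
       * replica is placed randomly. For a fresh block with
       * numOfReplicas == 3 the walk is:
       *
       *   results: []      case 0 -> [local]             (writer's node)
       *   results: [A]     case 1 -> [A, remote]         (different rack)
       *   results: [A, B]  case 2 -> [A, B, sameRackAsA] (two racks total)
       *
       * ending with the 2-racks / 3-nodes layout described in the class
       * comment.
       */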
     
      /* choose <i>localMachine</i> as the target.
       * if <i>localMachine</i> is not available,
       * choose a node on the same rack
       * @return the chosen node
       */
      private DatanodeDescriptor chooseLocalNode(
          DatanodeDescriptor localMachine,
          DatanodeDescriptor[] nodes,
          List<DatanodeDescriptor> excludedNodes,
          long blocksize,
          int maxNodesPerRack,
          List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
        // if no local machine, randomly choose one node
        if(localMachine == null)
          return chooseRandom(nodes, excludedNodes,
              blocksize, maxNodesPerRack, results);
       
        // otherwise try local machine first
        if(!excludedNodes.contains(localMachine)) {
          excludedNodes.add(localMachine);
          if( isGoodTarget(localMachine, blocksize, maxNodesPerRack, results)) {
            results.add(localMachine);
            return localMachine;
          }
        }
       
        // try a node on local rack
        return chooseLocalRack(localMachine, nodes, excludedNodes,
            blocksize, maxNodesPerRack, results);
      }
     
      /* choose one node from the rack that <i>localMachine</i> is on.
       * if no such node is available, choose one node from the rack where
       * a second replica resides.
       * if still no such node is available, choose a random node
       * from the cluster <i>nodes</i>.
       * @return the chosen node
       */
      private DatanodeDescriptor chooseLocalRack(
          DatanodeDescriptor localMachine,
          DatanodeDescriptor[] nodes,
          List<DatanodeDescriptor> excludedNodes,
          long blocksize,
          int maxNodesPerRack,
          List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
        // no local machine, so choose a random machine
        if( localMachine == null ) {
          return chooseRandom(nodes, excludedNodes,
              blocksize, maxNodesPerRack, results );
        }
       
        // choose one from the local rack
        try {
          return chooseRandom(
              clusterMap.getLeaves( localMachine.getNetworkLocation() ),
              excludedNodes, blocksize, maxNodesPerRack, results);
        } catch (NotEnoughReplicasException e1) {
          // find the second replica
          DatanodeDescriptor newLocal=null;
          for(Iterator<DatanodeDescriptor> iter=results.iterator();
          iter.hasNext();) {
            DatanodeDescriptor nextNode = iter.next();
            if(nextNode != localMachine) {
              newLocal = nextNode;
              break;
            }
          }
          if( newLocal != null ) {
            try {
              return chooseRandom(
                  clusterMap.getLeaves( newLocal.getNetworkLocation() ),
                  excludedNodes, blocksize, maxNodesPerRack, results);
            } catch( NotEnoughReplicasException e2 ) {
              //otherwise randomly choose one from the network
              return chooseRandom(nodes, excludedNodes,
                  blocksize, maxNodesPerRack, results);
            }
          } else {
            //otherwise randomly choose one from the network
            return chooseRandom(nodes, excludedNodes,
                blocksize, maxNodesPerRack, results);
          }
        }
      }
     
      /* choose <i>numOfReplicas</i> nodes from the racks
       * that <i>localMachine</i> is NOT on.
       * if not enough nodes are available, choose the remaining ones
       * from the local rack
       */
     
      private void chooseRemoteRack( int numOfReplicas,
          DatanodeDescriptor localMachine,
          DatanodeDescriptor[] nodes,
          List<DatanodeDescriptor> excludedNodes,
          long blocksize,
          int maxReplicasPerRack,
          List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
        // get all the nodes on the local rack
        DatanodeDescriptor[] nodesOnRack = clusterMap.getLeaves(
            localMachine.getNetworkLocation() );
       
        // compute the set difference (nodes - nodesOnRack) using hash sets
        DatanodeDescriptor[] nodesOnRemoteRack
        = new DatanodeDescriptor[nodes.length-nodesOnRack.length];
        HashSet<DatanodeDescriptor> set1 = new HashSet<DatanodeDescriptor>(nodes.length);
        HashSet<DatanodeDescriptor> set2 = new HashSet<DatanodeDescriptor>(nodesOnRack.length);
        for(int i=0; i<nodes.length; i++) {
          set1.add(nodes[i]);
        }
        for(int i=0; i<nodesOnRack.length; i++) {
          set2.add(nodesOnRack[i]);
        }
        set1.removeAll(set2);
        nodesOnRemoteRack = set1.toArray(nodesOnRemoteRack);
       
        int oldNumOfReplicas = results.size();
        // randomly choose one node from remote racks
        try {
          chooseRandom( numOfReplicas, nodesOnRemoteRack, excludedNodes,
              blocksize, maxReplicasPerRack, results );
        } catch (NotEnoughReplicasException e) {
          chooseRandom( numOfReplicas-(results.size()-oldNumOfReplicas),
              nodesOnRack, excludedNodes, blocksize,
              maxReplicasPerRack, results);
        }
      }
     
      /* Randomly choose one target from <i>nodes</i>.
       * @return the chosen node
       */
      private DatanodeDescriptor chooseRandom(
          DatanodeDescriptor[] nodes,
          List<DatanodeDescriptor> excludedNodes,
          long blocksize,
          int maxNodesPerRack,
          List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
        DatanodeDescriptor result;
        do {
          DatanodeDescriptor[] selectedNodes =
            chooseRandom(1, nodes, excludedNodes);
          if(selectedNodes.length == 0 ) {
            throw new NotEnoughReplicasException(
            "Not able to place enough replicas" );
          }
          result = (DatanodeDescriptor)(selectedNodes[0]);
        } while( !isGoodTarget( result, blocksize, maxNodesPerRack, results));
        results.add(result);
        return result;
      }
     
      /* Randomly choose <i>numOfReplicas</i> targets from <i>nodes</i>.
       */
      private void chooseRandom(int numOfReplicas,
          DatanodeDescriptor[] nodes,
          List<DatanodeDescriptor> excludedNodes,
          long blocksize,
          int maxNodesPerRack,
          List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
        boolean toContinue = true;
        do {
          DatanodeDescriptor[] selectedNodes =
            chooseRandom(numOfReplicas, nodes, excludedNodes);
          if(selectedNodes.length < numOfReplicas) {
            toContinue = false;
          }
          for(int i=0; i<selectedNodes.length; i++) {
            DatanodeDescriptor result = (DatanodeDescriptor)(selectedNodes[i]);
            if( isGoodTarget( result, blocksize, maxNodesPerRack, results)) {
              numOfReplicas--;
              results.add(result);
            }
          } // end of for
        } while (numOfReplicas>0 && toContinue );
       
        if(numOfReplicas>0) {
          throw new NotEnoughReplicasException(
          "Not able to place enough replicas");
        }
      }
     
      /* Randomly choose <i>numOfReplicas</i> nodes from <i>nodes</i>.
       * @return the chosen nodes
       */
      private DatanodeDescriptor[] chooseRandom(int numOfReplicas,
          DatanodeDescriptor[] nodes,
          List<DatanodeDescriptor> excludedNodes) {
        List<DatanodeDescriptor> results =
          new ArrayList<DatanodeDescriptor>();
        // count only nodes the selection loop below can actually pick:
        // not excluded and not (being) decommissioned. Counting other
        // nodes here could leave the loop spinning forever waiting for
        // a node it is never allowed to choose.
        int numOfAvailableNodes = 0;
        for(int i=0; i<nodes.length; i++) {
          if( !excludedNodes.contains(nodes[i]) &&
              !nodes[i].isDecommissionInProgress() &&
              !nodes[i].isDecommissioned() ) {
            numOfAvailableNodes++;
          }
        }
        numOfReplicas = (numOfAvailableNodes<numOfReplicas)?
            numOfAvailableNodes:numOfReplicas;
        while( numOfReplicas > 0 ) {
          DatanodeDescriptor chosenNode = nodes[r.nextInt(nodes.length)];
          if(!excludedNodes.contains(chosenNode) &&
               !chosenNode.isDecommissionInProgress() &&
               !chosenNode.isDecommissioned()) {
            results.add( chosenNode );
            excludedNodes.add(chosenNode);
            numOfReplicas--;
          }
        }
        return results.toArray(
            new DatanodeDescriptor[results.size()]);
      }
     
      /* judge if a node is a good target.
       * return true if <i>node</i> has enough space,
       * does not have too much load, and the rack does not have too many nodes
       */
      private boolean isGoodTarget( DatanodeDescriptor node,
          long blockSize, int maxTargetPerLoc,
          List<DatanodeDescriptor> results) {
       
        // check if the node is (being) decommissioned
        if(node.isDecommissionInProgress() || node.isDecommissioned()) {
          return false;
        }

        // check the remaining capacity of the target machine
        if(blockSize* FSConstants.MIN_BLOCKS_FOR_WRITE>node.getRemaining() ) {
          return false;
        }
       
        // check the communication traffic of the target machine
        double avgLoad = 0;
        int size = clusterMap.getNumOfLeaves();
        if( size != 0 ) {
          avgLoad = (double)totalLoad()/size;
        }
        if(node.getXceiverCount() > (2.0 * avgLoad)) {
          return false;
        }
       
        // check if the target rack has chosen too many nodes
        String rackname = node.getNetworkLocation();
        int counter=1;
        for( Iterator<DatanodeDescriptor> iter = results.iterator();
        iter.hasNext(); ) {
          DatanodeDescriptor result = iter.next();
          if(rackname.equals(result.getNetworkLocation())) {
            counter++;
          }
        }
        if(counter>maxTargetPerLoc) {
          return false;
        }
        return true;
      }
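      /* A worked example of the load test above: with 100 datanodes in the
       * cluster and totalLoad() == 400 active transceivers, avgLoad is 4.0,
       * so any candidate currently serving more than 8 xceivers is
       * rejected. The space test is analogous: the candidate must have
       * room left for at least MIN_BLOCKS_FOR_WRITE blocks of this size,
       * and the rack test caps how many chosen targets may share the
       * candidate's rack.
       */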
     
      /* Return a pipeline of nodes.
       * The pipeline is formed by finding a shortest path that
       * starts from the writer and traverses all <i>nodes</i>.
       * This is basically a traveling salesman problem.
       */
      private DatanodeDescriptor[] getPipeline(
          DatanodeDescriptor writer,
          List<DatanodeDescriptor> nodes ) {
        int numOfNodes = nodes.size();
        DatanodeDescriptor[] results = new DatanodeDescriptor[numOfNodes];
        if( numOfNodes==0 ) return results;
       
        synchronized( clusterMap ) {
          int index=0;
          if(writer == null || !clusterMap.contains(writer)) {
            writer = nodes.get(0);
          }
          for( ;index<numOfNodes; index++ ) {
            DatanodeDescriptor shortestNode = null;
            int shortestDistance = Integer.MAX_VALUE;
            int shortestIndex = index;
            for( int i=index; i<numOfNodes; i++ ) {
              DatanodeDescriptor currentNode = nodes.get(i);
              int currentDistance = clusterMap.getDistance( writer, currentNode );
              if(shortestDistance>currentDistance ) {
                shortestDistance = currentDistance;
                shortestNode = currentNode;
                shortestIndex = i;
              }
            }
            //switch position index & shortestIndex
            if( index != shortestIndex ) {
              nodes.set(shortestIndex, nodes.get(index));
              nodes.set(index, shortestNode);
            }
            writer = shortestNode;
          }
        }
        return nodes.toArray( results );
      }
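      /* getPipeline() is greedy nearest-neighbor: repeatedly swap the
       * unplaced node closest to the current tail into the next slot.
       * Distilled, with a hypothetical dist() standing in for
       * clusterMap.getDistance():
       *
       *   DatanodeDescriptor tail = writer;
       *   for (int i = 0; i < n; i++) {
       *     int best = i;
       *     for (int j = i; j < n; j++)
       *       if (dist(tail, nodes[j]) < dist(tail, nodes[best])) best = j;
       *     swap(nodes, i, best);
       *     tail = nodes[i];
       *   }
       *
       * Greedy nearest-neighbor does not solve the underlying traveling
       * salesman problem optimally in general, but for the 2-3 node
       * pipelines built here it is effectively exact and runs in O(n^2).
       */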
     
      /** Return datanodes sorted by their distance to <i>reader</i>.
       */
      DatanodeDescriptor[] sortByDistance(
          final DatanodeDescriptor reader,
          List<DatanodeDescriptor> nodes ) {
          synchronized(clusterMap) {
              if(reader != null && clusterMap.contains(reader)) {
                  java.util.Collections.sort(nodes, new Comparator<DatanodeDescriptor>() {
                      public int compare(DatanodeDescriptor n1, DatanodeDescriptor n2) {
                          return clusterMap.getDistance(reader, n1)
                          -clusterMap.getDistance(reader, n2);
                      }
                  });
              }
          }
          return nodes.toArray(
                  new DatanodeDescriptor[nodes.size()]);
      }
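
      /* Note on the comparator above: returning the difference of two
       * distances is safe only while getDistance() yields small non-negative
       * values, as hop counts do here; for arbitrary ints the subtraction
       * can overflow. On Java 7+ the overflow-proof form would be:
       *
       *   return Integer.compare(clusterMap.getDistance(reader, n1),
       *                          clusterMap.getDistance(reader, n2));
       */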
     
    } //end of Replicator


    /**
     * Information about a file while it is being written to.
     * Note that during this time the file is not visible to other clients.
     *
     * This class contains a <code>Collection</code> of {@link Block}s that have
     * been written into the file so far, plus the file's replication factor.
     *
     * @author shv
     */
    private class FileUnderConstruction {
      private short blockReplication; // file replication
      private long blockSize;
      private Collection<Block> blocks;
      private UTF8 clientName;         // lease holder
      private UTF8 clientMachine;
     
      FileUnderConstruction(short replication,
                            long blockSize,
                            UTF8 clientName,
                            UTF8 clientMachine) throws IOException {
        this.blockReplication = replication;
        this.blockSize = blockSize;
        this.blocks = new ArrayList<Block>();
        this.clientName = clientName;
        this.clientMachine = clientMachine;
      }
     
      public short getReplication() {
        return this.blockReplication;
      }
     
      public long getBlockSize() {
        return blockSize;
      }
     
      public Collection<Block> getBlocks() {
        return blocks;
      }
     
      public UTF8 getClientName() {
        return clientName;
      }
     
      public UTF8 getClientMachine() {
        return clientMachine;
      }
    }
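
    /* A minimal construction sketch (hypothetical values, not part of the
     * original class) of the record above, roughly as it would be created
     * when a client opens a file for writing:
     *
     *   FileUnderConstruction pendingFile =
     *       new FileUnderConstruction((short) 3,         // replication factor
     *                                 64 * 1024 * 1024L,  // 64 MB block size
     *                                 new UTF8("DFSClient_123"),       // lease holder
     *                                 new UTF8("client.example.com")); // client machine
     *   pendingFile.getBlocks().add(newlyAllocatedBlock); // per written block
     */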

    /**
     * Get data node by storage ID.
     *
     * @param nodeID
     * @return DatanodeDescriptor or null if the node is not found.
     * @throws IOException
     */
    public DatanodeDescriptor getDatanode( DatanodeID nodeID ) throws IOException {
      UnregisteredDatanodeException e = null;
      DatanodeDescriptor node = datanodeMap.get(nodeID.getStorageID());
      if (node == null)
        return null;
      if (!node.getName().equals(nodeID.getName())) {
        e = new UnregisteredDatanodeException( nodeID, node );
        NameNode.stateChangeLog.fatal("BLOCK* NameSystem.getDatanode: "
            + e.getLocalizedMessage() );
        throw e;
      }
      return node;
    }
   
    /**
     * Find a data node by its name.
     *
     * This method is called when the node is registering, so it is not
     * performance critical and a linear scan suffices; supporting fast
     * lookup by name would otherwise require an additional tree-like
     * structure.
     *
     * @param name
     * @return DatanodeDescriptor if found, or null otherwise
     * @throws IOException
     */
    public DatanodeDescriptor getDatanodeByName( String name ) throws IOException {
      for (Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator(); it.hasNext(); ) {
        DatanodeDescriptor node = it.next();
        if( node.getName().equals(name) )
           return node;
      }
      return null;
    }
   
    /* Find data node by its host name. */
    private DatanodeDescriptor getDatanodeByHost( String name ) {
        for (Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator();
        it.hasNext(); ) {
            DatanodeDescriptor node = it.next();
            if( node.getHost().equals(name) )
                return node;
        }
        return null;
    }
   
    /** Return the datanode at the given index (used for content browsing). */
    private DatanodeInfo getDatanodeByIndex( int index ) {
      int i = 0;
      for (Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator(); it.hasNext(); ) {
        DatanodeInfo node = it.next();
        if( i == index )
           return node;
        i++;
      }
      return null;
    }
   
    public String randomDataNode() {
      int size = datanodeMap.size();
      int index = 0;
      if (size != 0) {
        index = r.nextInt(size);
        DatanodeInfo d = getDatanodeByIndex(index);
        if (d != null) {
          return d.getHost() + ":" + d.getInfoPort();
        }
      }
      return null;
    }
   
    public int getNameNodeInfoPort() {
      return infoPort;
    }

    /**
     * SafeModeInfo contains information related to the safe mode.
     * <p>
     * An instance of {@link SafeModeInfo} is created when the name node
     * enters safe mode.
     * <p>
     * During name node startup {@link SafeModeInfo} counts the number of
     * <em>safe blocks</em>, those that have at least the minimal number of
     * replicas, and calculates the ratio of safe blocks to the total number
     * of blocks in the system, which is the size of
     * {@link FSDirectory#activeBlocks}. When the ratio reaches the
     * {@link #threshold} it starts the {@link SafeModeMonitor} daemon, which
     * waits until the safe mode extension has passed; the name node then
     * leaves safe mode and destroys this object.
     * <p>
     * If safe mode is turned on manually then the number of safe blocks is
     * not tracked, because the name node is not intended to leave safe mode
     * automatically in that case.
     *
     * @see ClientProtocol#setSafeMode(FSConstants.SafeModeAction)
     * @see SafeModeMonitor
     * @author Konstantin Shvachko
     */
    class SafeModeInfo {
      // configuration fields
      /** Safe mode threshold condition %.*/
      private double threshold;
      /** Safe mode extension after the threshold. */
      private int extension;
      /** Min replication required by safe mode. */
      private int safeReplication;
     
      // internal fields
      /** Time when the threshold was reached.
       *
       * <br>-1 safe mode is off
       * <br> 0 safe mode is on, but the threshold is not reached yet
       * <br>&gt;0 time in msec when the threshold was reached
       */
      private long reached = -1;
      /** Total number of blocks. */
      int blockTotal;
      /** Number of safe blocks. */
      private int blockSafe;
     
      /**
       * Creates SafeModeInfo when the name node enters
       * automatic safe mode at startup.
       * 
       * @param conf configuration
       */
      SafeModeInfo( Configuration conf ) {
        this.threshold = conf.getFloat( "dfs.safemode.threshold.pct", 0.95f );
        this.extension = conf.getInt( "dfs.safemode.extension", 0 );
        this.safeReplication = conf.getInt( "dfs.replication.min", 1 );
        this.blockTotal = 0;
        this.blockSafe = 0;
      }

      /**
       * Creates SafeModeInfo when safe mode is entered manually.
       *
       * The {@link #threshold} is set to 1.5 so that it could never be reached.
       * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
       *
       * @see SafeModeInfo
       */
      private SafeModeInfo() {
        this.threshold = 1.5f;  // this threshold can never be reached
        this.extension = 0;
        this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
        this.blockTotal = -1;
        this.blockSafe = -1;
        this.reached = -1;
        enter();
      }
     
      /**
       * Check if safe mode is on.
       * @return true if in safe mode
       */
      synchronized boolean isOn() {
        try {
          isConsistent();   // SHV this is an assert
        } catch( IOException e ) {
          System.err.print( StringUtils.stringifyException( e ));
        }
        return this.reached >= 0;
      }
     
      /**
       * Enter safe mode.
       */
      void enter() {
        if( reached != 0 )
          NameNode.stateChangeLog.info(
            "STATE* SafeModeInfo.enter: " + "Safe mode is ON.\n"
            + getTurnOffTip() );
        this.reached = 0;
      }
     
      /**
       * Leave safe mode.
       */
      synchronized void leave() {
        if( reached >= 0 )
          NameNode.stateChangeLog.info(
            "STATE* SafeModeInfo.leave: " + "Safe mode is OFF." );
        reached = -1;
        safeMode = null;
        NameNode.stateChangeLog.info("STATE* Network topology has "
                +clusterMap.getNumOfRacks()+" racks and "
                +clusterMap.getNumOfLeaves()+ " datanodes");
      }
     
      /**
       * Safe mode can be turned off iff
       * the threshold is reached and
       * the extension time has passed.
       * @return true if it is safe to leave, false otherwise.
       */
      synchronized boolean canLeave() {
        if( reached == 0 )
          return false;
        if( now() - reached < extension )
          return false;
        return ! needEnter();
      }
     
      /**
       * Check whether safe mode needs to be entered (or kept).
       * There is no need to enter safe mode if DFS is empty or
       * {@link #threshold} == 0, since {@link #getSafeBlockRatio()}
       * is then never below the threshold.
       */
      boolean needEnter() {
        return getSafeBlockRatio() < threshold;
      }
     
      /**
       * Ratio of the number of safe blocks to the total number of blocks
       * to be compared with the threshold.
       */
      private float getSafeBlockRatio() {
        return ( blockTotal == 0 ? 1 : (float)blockSafe/blockTotal );
      }
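
      /* Worked example (hypothetical counts): with blockTotal == 1000 and
       * blockSafe == 950 the ratio is 950f/1000 == 0.95, which meets the
       * default "dfs.safemode.threshold.pct" of 0.95f. An empty namespace
       * (blockTotal == 0) reports a ratio of 1, so an empty DFS never keeps
       * the name node in safe mode.
       */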
     
      /**
       * Check and trigger safe mode if needed.
       */
      private void checkMode() {
        if( needEnter() ) {
          enter();
          return;
        }
        // the threshold is reached
        if( ! isOn() ||                           // safe mode is off
            extension <= 0 || threshold <= 0 ) {  // don't need to wait
          this.leave();                           // just leave safe mode
          return;
        }
        if( reached > 0 )  // threshold has already been reached before
          return;
        // start monitor
        reached = now();
        smmthread = new Daemon(new SafeModeMonitor());
        smmthread.start();
      }
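
      /* The transitions driven by checkMode() are encoded in 'reached':
       *
       *   reached == -1 : safe mode is off
       *   reached ==  0 : safe mode is on, threshold not yet reached
       *   reached  >  0 : timestamp at which the threshold was first
       *                   reached; the SafeModeMonitor then calls leave()
       *                   once now() - reached >= extension and
       *                   needEnter() stays false
       */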
     
      /**
       * Set total number of blocks.
       */
      synchronized void setBlockTotal( int total) {
        this.blockTotal = total;
        checkMode();
      }
     
      /**
       * Increment number of safe blocks if current block has
       * reached minimal replication.
       * @param replication current replication
       */
      synchronized void incrementSafeBlockCount( short replication ) {
        if( (int)replication == safeReplication )
          this.blockSafe++;
        checkMode();
      }
     
      /**
       * Decrement number of safe blocks if current block has
       * fallen below minimal replication.
       * @param replication current replication
       */
      synchronized void decrementSafeBlockCount( short replication ) {
        if( replication == safeReplication-1 )
          this.blockSafe--;
        checkMode();
      }
     
      /**
       * Check if safe mode was entered manually or at startup.
       */
      boolean isManual() {
        return blockTotal == -1;
      }
     
      /**
       * A tip on how safe mode is to be turned off: manually or automatically.
       */
      String getTurnOffTip() {
        return ( isManual() ?
            "Use \"hadoop dfs -safemode leave\" to turn safe mode off." :
            "Safe mode will be turned off automatically." );
      }
     
      /**
       * Returns printable state of the class.
       */
      public String toString() {
        String resText = "Current safe block ratio = "
          + getSafeBlockRatio()
          + ". Target threshold = " + threshold
          + ". Minimal replication = " + safeReplication + ".";
        if( reached > 0 )
          resText += " Threshold was reached " + new Date(reached) + ".";
        return resText;
      }
     
      /**
       * Checks consistency of the class state.
       */
      void isConsistent() throws IOException {
        if( blockTotal == -1 && blockSafe == -1 ) {
          return; // manual safe mode
        }
        int activeBlocks = dir.activeBlocks.size();
        if( blockTotal != activeBlocks )
          throw new IOException( "blockTotal " + blockTotal
              + " does not match all blocks count. "
              + "activeBlocks = " + activeBlocks
              + ". safeBlocks = " + blockSafe
              + " safeMode is: "
              + ((safeMode == null) ? "null" : safeMode.toString()) );
        if( blockSafe < 0 || blockSafe > blockTotal )
          throw new IOException( "blockSafe " + blockSafe
              + " is out of range [0," + blockTotal + "]. "
              + "activeBlocks = " + activeBlocks
              + " safeMode is: "
              + ((safeMode == null) ? "null" : safeMode.toString()) );
      }
    }
   
    /**
     * Periodically check whether it is time to leave safe mode.
     * This thread starts when the threshold level is reached.
     *
     * @author Konstantin Shvachko
     */
    class SafeModeMonitor implements Runnable {
      /** interval in msec for checking safe mode: {@value} */
      private static final long recheckInterval = 1000;
     
      /**
       * Poll until {@link SafeModeInfo#canLeave()} allows leaving safe mode.
       */
      public void run() {
        while( ! safeMode.canLeave() ) {
          try {
            Thread.sleep(recheckInterval);
          } catch (InterruptedException ie) {
          }
        }
        // leave safe mode and stop the monitor
        safeMode.leave();
        smmthread = null;
      }
    }
   
    /**
     * Current system time.
     * @return current time in msec.
     */
    static long now() {
      return System.currentTimeMillis();
    }
   
    /**
     * Check whether the name node is in safe mode.
     * @return true if safe mode is ON, false otherwise
     */
    boolean isInSafeMode() {
      if( safeMode == null )
        return false;
      return safeMode.isOn();
    }
   
    /**
     * Increment number of blocks that reached minimal replication.
     * @param replication current replication
     */
    void incrementSafeBlockCount( int replication ) {
      if( safeMode == null )
        return;
      safeMode.incrementSafeBlockCount( (short)replication );
    }

    /**
     * Decrement number of blocks that reached minimal replication.
     * @param replication current replication
     */
    void decrementSafeBlockCount( int replication ) {
      if( safeMode == null )
        return;
      safeMode.decrementSafeBlockCount( (short)replication );
    }

    /**
     * Set the total number of blocks in the system.
     */
    void setBlockTotal() {
      if( safeMode == null )
        return;
      safeMode.setBlockTotal( dir.activeBlocks.size() );
    }

    /**
     * Enter safe mode manually.
     * @throws IOException
     */
    synchronized void enterSafeMode() throws IOException {
      if( isInSafeMode() ) {
        NameNode.stateChangeLog.info(
            "STATE* FSNamesystem.enterSafeMode: " + "Safe mode is already ON.");
        return;
      }
      safeMode = new SafeModeInfo();
    }
   
    /**
     * Leave safe mode.
     * @throws IOException
     */
    synchronized void leaveSafeMode() throws IOException {
      if( ! isInSafeMode() ) {
        NameNode.stateChangeLog.info(
            "STATE* FSNamesystem.leaveSafeMode: " + "Safe mode is already OFF.");
        return;
      }
      safeMode.leave();
    }
   
    String getSafeModeTip() {
      if( ! isInSafeMode() )
        return "";
      return safeMode.getTurnOffTip();
    }

    long getEditLogSize() throws IOException {
      return getEditLog().getEditLogSize();
    }

    synchronized void rollEditLog() throws IOException {
      if (isInSafeMode()) {
        throw new SafeModeException("Checkpoint not created",
                                     safeMode);
      }
      LOG.info("Roll Edit Log");
      getEditLog().rollEditLog();
    }

    synchronized void rollFSImage() throws IOException {
      LOG.info("Roll FSImage");
      if (isInSafeMode()) {
        throw new SafeModeException("Checkpoint not created",
                                    safeMode);
      }
      dir.fsImage.rollFSImage();
    }

    File getFsImageName() throws IOException {
      return dir.fsImage.getFsImageName();
    }

    File[] getFsImageNameCheckpoint() throws IOException {
      return dir.fsImage.getFsImageNameCheckpoint();
    }

    File getFsEditName() throws IOException {
      return getEditLog().getFsEditName();
    }
   
    /**
     * This servlet is registered with the Namesystem's Jetty server
     * to run fsck against the name node's namespace.
     * @author Milind Bhandarkar
     */
    public static class FsckServlet extends HttpServlet {
      public void doGet(HttpServletRequest request,
          HttpServletResponse response
          ) throws ServletException, IOException {
        Map<String,String[]> pmap = request.getParameterMap();
        try {
          ServletContext context = getServletContext();
          NameNode nn = (NameNode) context.getAttribute("name.node");
          Configuration conf = (Configuration) context.getAttribute("name.conf");
          NamenodeFsck fscker = new NamenodeFsck(conf, nn, pmap, response);
          fscker.fsck();
        } catch (IOException ie) {
          LOG.warn(StringUtils.stringifyException(ie));
          String errMsg = "Fsck on path " + pmap.get("path") + " failed.";
          response.sendError(HttpServletResponse.SC_GONE, errMsg);
          throw ie;
        }
      }
    }
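
    /* A hedged usage sketch (host name, info port, and the /fsck mount
     * point are placeholders): the servlet above is queried over the name
     * node's HTTP info port, passing the path to check as the "path"
     * request parameter:
     *
     *   java.net.URL fsck =
     *       new java.net.URL("http://namenode.example.com:50070/fsck?path=/");
     *   BufferedReader in = new BufferedReader(
     *       new InputStreamReader(fsck.openStream()));
     *   for (String line; (line = in.readLine()) != null; )
     *     System.out.println(line);      // the fsck report, line by line
     *   in.close();
     */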

    /**
     * This servlet is registered with the Namesystem's Jetty server
     * to retrieve a file. Typically used by the Secondary NameNode to
     * retrieve the image and edits files for periodic checkpointing.
     * @author Dhruba Borthakur
     */
    public static class GetImageServlet extends HttpServlet {
      public void doGet(HttpServletRequest request,
          HttpServletResponse response
          ) throws ServletException, IOException {
        Map<String,String[]> pmap = request.getParameterMap();
        try {
          ServletContext context = getServletContext();
          NameNode nn = (NameNode) context.getAttribute("name.node");
          Configuration conf = (Configuration) context.getAttribute("name.conf");
          TransferFsImage ff = new TransferFsImage(pmap, request, response);
          if (ff.getImage()) {
            // send fsImage to Secondary
            TransferFsImage.getFileServer(response.getOutputStream(),
                                          nn.getFsImageName());
          } else if (ff.getEdit()) {
            // send old edits to Secondary
            TransferFsImage.getFileServer(response.getOutputStream(),
                                          nn.getFsEditName());
          } else if (ff.putImage()) {
            // issue an HTTP GET request to download the new fsimage
            TransferFsImage.getFileClient(ff.getInfoServer(), "getimage=1",
                                          nn.getFsImageNameCheckpoint());
          }
        } catch (IOException ie) {
          LOG.warn(StringUtils.stringifyException(ie));
          String errMsg = "GetImage failed.";
          response.sendError(HttpServletResponse.SC_GONE, errMsg);
          throw ie;
        }
      }
    }
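
    /* A hedged sketch of the checkpoint exchange this servlet serves.
     * Host and port are placeholders; "getimage=1" is the query string
     * used by getFileClient above, and "getedit=1" is assumed by analogy:
     *
     *   // Secondary NameNode pulls the current image and edits ...
     *   TransferFsImage.getFileClient("namenode.example.com:50070",
     *                                 "getimage=1", checkpointImageFiles);
     *   TransferFsImage.getFileClient("namenode.example.com:50070",
     *                                 "getedit=1", checkpointEditsFiles);
     *   // ... and later asks the name node to fetch the merged image
     *   // back via the putImage() branch above.
     */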
}