Source Code of org.apache.hadoop.hdfs.server.datanode.FSDataset$NamespaceSlice

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;

import java.nio.channels.FileChannel;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import javax.management.StandardMBean;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.DF;
import org.apache.hadoop.fs.DU;
import org.apache.hadoop.fs.DU.NamespaceSliceDU;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.datanode.BlockInlineChecksumReader.GenStampAndChecksum;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DirectoryScanner.ScanDifference;
import org.apache.hadoop.hdfs.server.datanode.NamespaceMap.BlockBucket;
import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean;
import org.apache.hadoop.hdfs.server.protocol.BlockFlags;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryInfo;
import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol;
import org.apache.hadoop.hdfs.util.LightWeightHashSet;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.metrics.util.MBeanUtil;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.VersionInfo;
import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;


/**************************************************
* FSDataset manages a set of data blocks.  Each block
* has a unique name and an extent on disk.
*
***************************************************/
public class FSDataset implements FSConstants, FSDatasetInterface {
 
  public static final Log LOG = LogFactory.getLog(FSDataset.class);
 
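  /**
   * Callback interface used to notify a listener when blocks are added to,
   * removed from, or updated in the dataset for a given namespace.
   */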
  interface FSDatasetDeltaInterface {
    void addBlock(int namespaceId, Block block);

    void removeBlock(int namespaceId, Block block);

    void updateBlock(int namespaceId, Block oldBlock, Block newBlock);
  }
 
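  /**
   * Return the simple file names of the given files, index-aligned with the
   * input array, so callers can avoid repeated File.getName() calls.
   */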
  static String[] getFileNames(File[] files) {
    String[] fileNames = new String[files.length];
    for (int i = 0; i < files.length; i++) {
      fileNames[i] = files[i].getName();
    }
    return fileNames;
  }

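  /**
   * Build a Block from the file at the given index, handling both the
   * separate-checksum and inline-checksum block file layouts. Returns null
   * if the file name does not match either block file naming scheme.
   */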
  static Block getBlockFromNames(File blockFiles[], String[] blockFilesNames, int index)
      throws IOException {
    if (Block.isSeparateChecksumBlockFilename(blockFilesNames[index])) {
      long genStamp = BlockWithChecksumFileReader
          .getGenerationStampFromSeperateChecksumFile(blockFilesNames,
              blockFilesNames[index]);
      return new Block(blockFiles[index], blockFiles[index].length(),
          genStamp);
    } else if (Block.isInlineChecksumBlockFilename(blockFilesNames[index])) {
      // TODO: We might want to optimize it.
      GenStampAndChecksum sac = BlockInlineChecksumReader
          .getGenStampAndChecksumFromInlineChecksumFile(blockFilesNames[index]);
      long blockLength = BlockInlineChecksumReader.getBlockSizeFromFileLength(
          blockFiles[index].length(), sac.checksumType, sac.bytesPerChecksum);

      return new Block(blockFiles[index], blockLength, sac.generationStamp);
    }
    return null;
  }

  /**
   * A NamespaceSlice represents a portion of a namespace stored on a volume. 
   * Taken together, all NamespaceSlices sharing a namespaceID across a
   * cluster represent a single namespace.
   */
  class NamespaceSlice {
    private final int namespaceId;
    private final FSVolume volume; // volume to which this namespaceSlice belongs
    private final FSDir dataDir; // StorageDirectory/current/nsid/current/finalized
    private final File detachDir; // directory for detached (copy-on-write) replicas
    private final File rbwDir; // directory for RBW (replica being written) replicas
    private final File tmpDir; // directory for temporary replicas
    private final NamespaceSliceDU dfsUsage;
    private volatile boolean blockCrcFileLoaded;

    /**
     *
     * @param namespaceId
     * @param volume {@link FSVolume} to which this NamespaceSlice belongs
     * @param nsDir directory corresponding to the NameSpaceSlice
     * @param conf
     * @throws IOException
     */
    NamespaceSlice(int namespaceId, FSVolume volume, File nsDir, Configuration conf, boolean supportAppends)
        throws IOException {
      this.namespaceId = namespaceId;
      this.volume = volume;
      File nsDirCur = new File(nsDir, DataStorage.STORAGE_DIR_CURRENT);
      File dataDirFile = new File(nsDirCur, DataStorage.STORAGE_DIR_FINALIZED);
      this.dataDir = new FSDir(namespaceId, dataDirFile, volume);
           
      this.detachDir = new File(nsDir, "detach");
      if (detachDir.exists()) {
        recoverDetachedBlocks(dataDirFile, detachDir);
      }

      // Files that were being written when the datanode was last shut down
      // are now moved back to the data directory. It is possible that
      // in the future, we might want to do some sort of datanode-local
      // recovery for these blocks. For example, crc validation.
      //
      this.tmpDir = new File(nsDir, "tmp");
      if (tmpDir.exists()) {       
        // rename tmpDir to prepare for deletion
        File toDeleteDir = new File(tmpDir.getParent(),
            DELETE_FILE_EXT + tmpDir.getName());
        if (tmpDir.renameTo(toDeleteDir)) {
          // asynchronously delete the renamed directory
          asyncDiskService.deleteAsyncFile(volume, toDeleteDir);
        } else {
          // rename failed, let's synchronously delete the directory
          FileUtil.fullyDelete(tmpDir);
          DataNode.LOG.warn("Deleted " + tmpDir.getPath());
        }
      }
     
      this.rbwDir = new File(nsDirCur, DataStorage.STORAGE_DIR_RBW);
      // Files that were being written when the datanode was last shut down
      // should not be deleted if append mode is enabled.
      if (rbwDir.exists()) {
        recoverBlocksBeingWritten(rbwDir);
      }
     
      if (!rbwDir.mkdirs()) {
        if (!rbwDir.isDirectory()) {
          throw new IOException("Mkdirs failed to create " + rbwDir.toString());
        }
      }
      if (!tmpDir.mkdirs()) {
        if (!tmpDir.isDirectory()) {
          throw new IOException("Mkdirs failed to create " + tmpDir.toString());
        }
      }
      if (!detachDir.mkdirs()) {
        if (!detachDir.isDirectory()) {
          throw new IOException("Mkdirs failed to create " + detachDir.toString());
        }
      }
      this.dfsUsage = volume.dfsUsage.addNamespace(namespaceId, nsDir, conf);
      this.blockCrcFileLoaded = false;
    }
   
    void getBlockInfo(LightWeightHashSet<Block> blocks) throws IOException{
      dataDir.getBlockInfo(blocks);
    }
   
   boolean isBlockCrcFileLoaded() {
     return blockCrcFileLoaded;
   }

   void setBlockCrcFileLoaded(boolean blockCrcFileLoaded) {
     this.blockCrcFileLoaded = blockCrcFileLoaded;
   }
   
    /**
     * Recover detached files on datanode restart. If a detached block
     * does not exist in the original directory, then it is moved to the
     * original directory.
     */
    private void recoverDetachedBlocks(File dataDir, File dir)
                                           throws IOException {
      File contents[] = dir.listFiles();

      if (contents == null) {
        return;
      }
      for (int i = 0; i < contents.length; i++) {
        if (!contents[i].isFile()) {
          throw new IOException ("Found " + contents[i] + " in " + dir +
                                 " but it is not a file.");
        }

        //
        // If the original block file still exists, then no recovery
        // is needed.
        //
        File blk = new File(dataDir, contents[i].getName());
        if (!blk.exists()) {
          if (!contents[i].renameTo(blk)) {
            throw new IOException("Unable to recover detached file " +
                                  contents[i]);
          }
          continue;
        }
        if (!contents[i].delete()) {
            throw new IOException("Unable to cleanup detached file " +
                                  contents[i]);
        }
      }
    }
   
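    /**
     * Collect a Block entry for every replica currently sitting in the RBW
     * (replica being written) directory of this namespace slice.
     */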
    void getBlocksBeingWrittenInfo(LightWeightHashSet<Block> blockSet) throws IOException {
      if (rbwDir == null) {
        return;
      }
      File[] blockFiles = rbwDir.listFiles();
      if (blockFiles == null) {
        return;
      }
      String[] blockFileNames = getFileNames(blockFiles);
      for (int i = 0; i < blockFiles.length; i++) {
        if (!blockFiles[i].isDirectory()) {
          // get each block in the rbwDir directory
          Block block = FSDataset.getBlockFromNames(blockFiles, blockFileNames, i);
          if (block != null) {
            // add this block to block set
            blockSet.add(block);
            if (DataNode.LOG.isDebugEnabled()) {
              DataNode.LOG.debug("recoverBlocksBeingWritten for block " + block);
            }           
          }
        }
      }
    }
   
    /**
     * Recover blocks that were being written when the datanode
     * was earlier shut down. These blocks get re-inserted into
     * ongoingCreates. Also, send a blockreceived message to the NN
     * for each of these blocks because these are not part of a
     * block report.
     */
    private void recoverBlocksBeingWritten(File bbw) throws IOException {
      FSDir fsd = new FSDir(namespaceId, bbw, this.volume);
      LightWeightHashSet<BlockAndFile> blockSet = new LightWeightHashSet<BlockAndFile>();
      fsd.getBlockAndFileInfo(blockSet);
      for (BlockAndFile b : blockSet) {
        File f = b.pathfile;  // full path name of block file
        lock.writeLock().lock();
        try {
          boolean isInlineChecksum = Block.isInlineChecksumBlockFilename(f
              .getName());
          int checksumType = DataChecksum.CHECKSUM_UNKNOWN;
          int bytesPerChecksum = -1;
          if (isInlineChecksum) {
            GenStampAndChecksum sac = BlockInlineChecksumReader
                .getGenStampAndChecksumFromInlineChecksumFile(f.getName());
            checksumType = sac.checksumType;
            bytesPerChecksum = sac.bytesPerChecksum;
          }
          DatanodeBlockInfo binfo = new DatanodeBlockInfo(volume, f,
              DatanodeBlockInfo.UNFINALIZED, true, isInlineChecksum,
              checksumType, bytesPerChecksum, false, 0);

          volumeMap.add(namespaceId, b.block, binfo);
          volumeMap.addOngoingCreates(namespaceId, b.block, new ActiveFile(
              binfo, true, ActiveFile.UNKNOWN_SIZE, false));
        } finally {
          lock.writeLock().unlock();
        }
        if (DataNode.LOG.isDebugEnabled()) {
          DataNode.LOG.debug("recoverBlocksBeingWritten for block " + b.block + " namespaceId: " + namespaceId);
        }
      }
    }

    File getDirectory() {
      return dataDir.getDirectory().getParentFile();
    }
   
    File getCurrentDir() {
      return dataDir.getDirectory();
    }
   
    File getRbwDir() {
      return rbwDir;
    }
   
    void decDfsUsed(long value) {
      dfsUsage.decDfsUsed(value);
    }
   
    long getDfsUsed() throws IOException {
      return dfsUsage.getUsed();
    }
   
    /**
     * Temporary files. They get moved to the finalized block directory when
     * the block is finalized.
     */
    File createTmpFile(Block b) throws IOException {
      File f = new File(tmpDir, b.getBlockName());
      return FSDataset.createTmpFile(b, f);
    }
   
    File createDetachFile(Block b) throws IOException {
      File f = new File(detachDir, b.getBlockName());
      return FSDataset.createTmpFile(b, f);
    }
   
    File getTmpFile(Block b) throws IOException {
      File f = new File(tmpDir, b.getBlockName());
      return f;
    }
   
    /**
     * Temporary files. They get moved to the finalized block directory when
     * the block is finalized.
     */
    File createTmpFile(Block b, boolean replicationRequest,
        boolean inlineChecksum, int checksumType, int bytesPerChecksum)
        throws IOException {
      File f= null;
      String fileName;
      if (inlineChecksum) {
        fileName = BlockInlineChecksumWriter.getInlineChecksumFileName(b,
            checksumType, bytesPerChecksum);
      } else {
        fileName = b.getBlockName();
      }
      if (!replicationRequest) {
        f = new File(rbwDir, fileName);
      } else {
        f = new File(tmpDir, fileName);
      }
      return FSDataset.createTmpFile(b, f);
    }
   

    /**
     * RBW files. They get moved to the finalized block directory when
     * the block is finalized.
     */
    File createRbwFile(Block b) throws IOException {
      File f = new File(rbwDir, b.getBlockName());
      return FSDataset.createTmpFile(b, f);
    }

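    /**
     * Move a finalized block file (and its separate metadata file, when
     * checksums are not inline) into the finalized directory and account for
     * the added space in dfsUsage.
     */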
    File addBlock(Block b, File f, boolean inlineChecksum, int checksumType,
        int bytesPerChecksum) throws IOException {
      File blockFile = dataDir.addBlock(namespaceId, b, f, inlineChecksum,
          checksumType, bytesPerChecksum);
      long spaceAdded;
      if (!inlineChecksum) {
        File metaFile = BlockWithChecksumFileWriter.getMetaFile(blockFile , b);
        spaceAdded = b.getNumBytes() + metaFile.length();
      } else {
        spaceAdded = blockFile.length();
      }
      dfsUsage.incDfsUsed(spaceAdded);
      return blockFile;
    }
     
    void checkDirs() throws DiskErrorException {
      dataDir.checkDirTree();
      DiskChecker.checkDir(tmpDir);
      DiskChecker.checkDir(detachDir);
      DiskChecker.checkDir(rbwDir);
    }
     
    void clearPath(File f) {
      dataDir.clearPath(f);
    }
     
    public String toString() {
      return dataDir.getDirectory().getAbsolutePath();
    }
   
    public void shutdown() {
      volume.dfsUsage.removeNamespace(namespaceId);
    }
  }

  /**
   * A data structure that encapsulates a Block along with the full pathname
   * of the block file
   */
  static class BlockAndFile implements Comparable<BlockAndFile> {
    final Block block;
    final File pathfile;

    BlockAndFile(File fullpathname, Block block) {
      this.pathfile = fullpathname;
      this.block = block;
    }

    public int compareTo(BlockAndFile o)
    {
      return this.block.compareTo(o.block);
    }
  }

  /**
   * A node type that can be built into a tree reflecting the
   * hierarchy of blocks on the local disk.
   */
  class FSDir {
    File dir;
    int numBlocks = 0;
    volatile FSDir childrenDirs[];
    int lastChildIdx = 0;
   
    File getDirectory(){
      return dir;
    }
   
    FSDir[] getChildren() {
      return childrenDirs;
    }
     
    public FSDir() {
    }
   
    public FSDir(int namespaceId, File dir) throws IOException{
      this(namespaceId, dir, null);
    }
   
    public FSDir(int namespaceId, File dir, FSVolume volume) throws IOException {
      this.dir = dir;
      this.childrenDirs = null;     
      if (!dir.exists()) {
        if (!dir.mkdirs()) {
          throw new IOException("Mkdirs failed to create " +
                                dir.toString());
        }
      } else {
        File[] files = dir.listFiles();
        String[] filesNames = getFileNames(files);
        int numChildren = 0;
        for (int i = 0; i < files.length; i++) {
          File file = files[i];
          String fileName = filesNames[i];
          if (isPendingDeleteFilename(fileName)) {
            // This should not throw an exception.
            // Obsolete files are not included in the block report.
            asyncDiskService.deleteAsyncFile(volume, file);
          } else if (file.isDirectory()) {
            numChildren++;
          } else if (Block.isSeparateChecksumBlockFilename(fileName)) {
            numBlocks++;
            if (volume != null) {  
              long blkSize = file.length();
              long genStamp = BlockWithChecksumFileReader
                  .getGenerationStampFromSeperateChecksumFile(filesNames,
                      fileName);
              volumeMap.add(namespaceId, new Block(file, blkSize, genStamp),
                  new DatanodeBlockInfo(volume, file, blkSize, true, false,
                      DataChecksum.CHECKSUM_UNKNOWN, -1, false, 0));
            }
          } else if (Block.isInlineChecksumBlockFilename(fileName)) {
            numBlocks++;
            if (volume != null) {
              GenStampAndChecksum sac = BlockInlineChecksumReader
                      .getGenStampAndChecksumFromInlineChecksumFile(fileName);
              long blkSize = BlockInlineChecksumReader
                  .getBlockSizeFromFileLength(file.length(), sac.checksumType,
                      sac.bytesPerChecksum);
              volumeMap.add(namespaceId, new Block(file, blkSize,
                  sac.generationStamp), new DatanodeBlockInfo(volume, file,
                  blkSize, true, true, sac.checksumType, sac.bytesPerChecksum,
                  false, 0));
            }
          }
        }
        if (numChildren > 0) {
          FSDir[] newChildren = new FSDir[numChildren];
          int curdir = 0;
          for (int idx = 0; idx < files.length; idx++) {
            String fileName = files[idx].getName();
            if (files[idx].isDirectory() && !isPendingDeleteFilename(fileName)) {
              newChildren[curdir] = new FSDir(namespaceId, files[idx], volume);
              curdir++;
            }
          }
          childrenDirs = newChildren;
        }
      }
    }
       
    public File addBlock(int namespaceId, Block b, File src,
        boolean inlineChecksum, int checksumType, int bytesPerChecksum)
        throws IOException {
      //First try without creating subdirectories
      File file = addBlock(namespaceId, b, src, false, false, inlineChecksum,
          checksumType, bytesPerChecksum);
      return (file != null) ? file : addBlock(namespaceId, b, src, true, true,
          inlineChecksum, checksumType, bytesPerChecksum);
    }

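    /*
     * Place the block in this directory if it still has room; otherwise try
     * the children round-robin starting from lastChildIdx, and only create a
     * new level of subdirectories when createOk is true.
     */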
    private File addBlock(int namespaceId, Block b, File src, boolean createOk,
        boolean resetIdx, boolean inlineChecksum, int checksumType,
        int bytesPerChecksum) throws IOException {
      if (numBlocks < maxBlocksPerDir) {
        File dest;
        if (!inlineChecksum) {
          dest = new File(dir, b.getBlockName());
          File metaData = BlockWithChecksumFileWriter.getMetaFile( src, b );
          File newmeta = BlockWithChecksumFileWriter.getMetaFile(dest, b);
          if ( ! metaData.renameTo( newmeta )) {
            throw new IOException("could not move file "
                + metaData.getAbsolutePath() + " to "
                + newmeta.getAbsolutePath());
          }
          if (DataNode.LOG.isDebugEnabled()) {
            DataNode.LOG.debug("addBlock: Moved " + metaData + " to " + newmeta);
          }
        } else {
          dest = new File(dir,
              BlockInlineChecksumWriter.getInlineChecksumFileName(b,
                  checksumType, bytesPerChecksum));
        }
        if (! src.renameTo( dest ) ) {
          throw new IOException( "could not move files for " + b +
                                 " from tmp to " +
                                 dest.getAbsolutePath() );
        }
        // fsyncIfPossible parent directory to persist rename.
        if (datanode.syncOnClose) {
          NativeIO.fsyncIfPossible(dest.getParent());
        }
        if (DataNode.LOG.isDebugEnabled()) {
          DataNode.LOG.debug("addBlock: Moved " + src + " to " + dest);
        }

        numBlocks += 1;
        return dest;
      }
      
      FSDir[] children = this.getChildren();
      if (lastChildIdx < 0 && resetIdx) {
        //reset so that all children will be checked
        lastChildIdx = random.nextInt(children.length);             
      }
           
      if (lastChildIdx >= 0 && children != null) {
        //Check if any child-tree has room for a block.
        for (int i=0; i < children.length; i++) {
          int idx = (lastChildIdx + i)%children.length;
          File file = children[idx].addBlock(namespaceId, b, src, false,
              resetIdx, inlineChecksum, checksumType, bytesPerChecksum);
          if (file != null) {
            lastChildIdx = idx;
            return file;
          }
        }
        lastChildIdx = -1;
      }
           
      if (!createOk) {
        return null;
      }
           
      if (children == null || children.length == 0) {
        // make sure children is immutable once initialized.
        FSDir[] newChildren = new FSDir[maxBlocksPerDir];
        for (int idx = 0; idx < maxBlocksPerDir; idx++) {
          newChildren[idx] = new FSDir(namespaceId, new File(dir,
              DataStorage.BLOCK_SUBDIR_PREFIX + idx));
        }
        childrenDirs = children = newChildren;
      }
           
      //now pick a child randomly for creating a new set of subdirs.
      lastChildIdx = random.nextInt(children.length);
      return children[lastChildIdx].addBlock(namespaceId, b, src, true, false,
          inlineChecksum, checksumType, bytesPerChecksum);
    }
   
    /**
     * Populate the given blockSet with any child blocks
     * found at this node.
     * @throws IOException
     */
    public void getBlockInfo(LightWeightHashSet<Block> blockSet) throws IOException {
      FSDir[] children = this.getChildren();
      if (children != null) {
        for (int i = 0; i < children.length; i++) {
          children[i].getBlockInfo(blockSet);
        }
      }

      File blockFiles[] = dir.listFiles();
      String[] blockFilesNames = getFileNames(blockFiles);
     
      for (int i = 0; i < blockFiles.length; i++) {
        Block block = getBlockFromNames(blockFiles, blockFilesNames, i);
        if (block != null) {
          blockSet.add(block);
        }
      }
    }

    /**
     * Populate the given blockSet with any child blocks
     * found at this node. With each block, return the full path
     * of the block file.
     * @throws IOException
     */
    void getBlockAndFileInfo(LightWeightHashSet<BlockAndFile> blockSet) throws IOException {
      FSDir[] children = this.getChildren();
      if (children != null) {
        for (int i = 0; i < children.length; i++) {
          children[i].getBlockAndFileInfo(blockSet);
        }
      }

      File blockFiles[] = dir.listFiles();
      String[] blockFilesNames = getFileNames(blockFiles);     
      for (int i = 0; i < blockFiles.length; i++) {
        Block block = getBlockFromNames(blockFiles, blockFilesNames, i);
        if (block != null) {
          blockSet.add(new BlockAndFile(blockFiles[i].getAbsoluteFile(), block));
        }
      }
    }

    /**
     * check if a data directory is healthy
     * @throws DiskErrorException
     */
    public void checkDirTree() throws DiskErrorException {
      DiskChecker.checkDir(dir);

      FSDir[] children = this.getChildren();
      if (children != null) {
        for (int i = 0; i < children.length; i++) {
          children[i].checkDirTree();
        }
      }
    }

    void clearPath(File f) {
      String root = dir.getAbsolutePath();
      String dir = f.getAbsolutePath();
      if (dir.startsWith(root)) {
        String[] dirNames = dir.substring(root.length()).
          split(File.separator + "subdir");
        if (clearPath(f, dirNames, 1))
          return;
      }
      clearPath(f, null, -1);
    }

    /*
     * dirNames is an array of string integers derived from
     * usual directory structure data/subdirN/subdirXY/subdirM ...
     * If the dirNames array is non-null, we only check the child at
     * the children[dirNames[idx]]. This avoids iterating over
     * children in common case. If directory structure changes
     * in later versions, we need to revisit this.
     */
    private boolean clearPath(File f, String[] dirNames, int idx) {
      if ((dirNames == null || idx == dirNames.length) &&
          dir.compareTo(f) == 0) {
        numBlocks--;
        return true;
      }

      FSDir[] children = this.getChildren();
      if (dirNames != null) {
        //guess the child index from the directory name
        if (idx > (dirNames.length - 1) || children == null) {
          return false;
        }
        int childIdx;
        try {
          childIdx = Integer.parseInt(dirNames[idx]);
        } catch (NumberFormatException ignored) {
          // layout changed? we could print a warning.
          return false;
        }
        return (childIdx >= 0 && childIdx < children.length) ?
          children[childIdx].clearPath(f, dirNames, idx+1) : false;
      }

      //guesses failed. back to blind iteration.
      if (children != null) {
        for(int i=0; i < children.length; i++) {
          if (children[i].clearPath(f, null, -1)){
            return true;
          }
        }
      }
      return false;
    }
   
    public String toString() {
      FSDir[] children = this.getChildren();
      return "FSDir{" +
        "dir=" + dir +
        ", children=" + (children == null ? null : Arrays.asList(children)) +
        "}";
    }
  }
 
  /**
   * A map from namespace ID to NamespaceSlice object
   *
   * Only three operations are supported: add a namespace, remove a namespace,
   * and get a snapshot of the namespace map, which is an immutable
   * object.
   *
   * No extra locking is allowed in this object
   */
  class NamespaceMap {
    /**
     * Any object referred here needs to be immutable. Every time this map is
     * updated, a new map is created and the reference here is changed to the
     * new map.
     */
    private Map<Integer, NamespaceSlice> namespaceMap = new HashMap<Integer, NamespaceSlice>();

    /**
     * This is the only method through which a caller is supposed to access
     * namespaceMap. It returns an immutable snapshot of the map.
     *
     * @return an immutable snapshot of the namespace map
     */
    private synchronized Map<Integer, NamespaceSlice> getNamespaceMapSnapshot() {
      return namespaceMap;
    }

    public synchronized void addNamespace(int namespaceId, NamespaceSlice ns)
        throws IOException {
      // add a new name-space by copying all the entries to a new map.
      Map<Integer, NamespaceSlice> newMap = new HashMap<Integer, NamespaceSlice>(
          namespaceMap);
      newMap.put(namespaceId, ns);
      namespaceMap = newMap;
    }

    public synchronized void removeNamespace(int namespaceId) {
      Map<Integer, NamespaceSlice> newMap = new HashMap<Integer, NamespaceSlice>(
          namespaceMap);
      newMap.remove(namespaceId);
      namespaceMap = newMap;
    }
  }
 
  public class FSVolume {
    private final NamespaceMap namespaceMap;
    private final File currentDir;    // <StorageDirectory>/current
    private final DF usage;          
    private final long reserved;
    private final FSDataset dataset;
    private DU dfsUsage;
    private final ExecutorService nativeIOExecutor;
   
    FSVolume(FSDataset dataset, File currentDir, Configuration conf) throws IOException {
      this.currentDir = currentDir;
      File parent = currentDir.getParentFile();
      this.usage = new DF(parent, conf);
      this.reserved = usage.getReserved();
      this.dataset = dataset;
      this.namespaceMap = new NamespaceMap();
      this.dfsUsage = new DU(currentDir, conf);
      this.dfsUsage.start();
      this.nativeIOExecutor = Executors.newSingleThreadExecutor();
    }
   
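    /**
     * Submit the given task to this volume's single-threaded native I/O
     * executor, serializing such operations per volume.
     */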
    public Future<?> submitNativeIOTask(Runnable task) {
      return nativeIOExecutor.submit(task);
    }

    /**
     * This is the only method through which a caller is supposed to access
     * namespaceMap. It returns an immutable snapshot of the map.
     * @return an immutable snapshot of the namespace map
     */
    private Map<Integer, NamespaceSlice> getNamespaceMapSnapshot() {
      return namespaceMap.getNamespaceMapSnapshot();
    }

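    /**
     * Return the NamespaceSlice for the given namespace id, or null if the
     * namespace has not been added to this volume.
     */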
    NamespaceSlice getNamespaceSlice(int namespaceId){
      return getNamespaceMapSnapshot().get(namespaceId);
    }
   
    /** Return storage directory corresponding to the volume */
    public File getDir() {
      return currentDir.getParentFile();
    }
       
    public File getBlockCrcFile(int namespaceId) {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      if (ns == null) {
        return null;
      }
      return new File(ns.getDirectory(), Storage.STORAGE_BLOCK_CRC);
    }

    public File getBlockCrcTmpFile(int namespaceId) {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      if (ns == null) {
        return null;
      }
      return new File(ns.getDirectory(), Storage.STORAGE_TMP_BLOCK_CRC);
    }

    public File getCurrentDir() {
      return currentDir;
    }
   
    public File getRbwDir(int namespaceId) throws IOException {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      return ns.getRbwDir();
    }
   
    void setNamespaceBlockCrcLoaded(int namespaceId, boolean loaded) {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      if (ns != null) {
        ns.setBlockCrcFileLoaded(loaded);
      }
    }

    boolean isNamespaceBlockCrcLoaded(int namespaceId) {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      if (ns != null) {
        return ns.isBlockCrcFileLoaded();
      } else {
        // if the namespace is not added
        return false;
      }
    }
   
    void decDfsUsed(int namespaceId, long value) {
      // this lock is put in FSVolume since it is called only by ReplicaFileDeleteWork
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      if (ns != null) {
        ns.decDfsUsed(value);
      }
    }
   
    long getDfsUsed() throws IOException {
      long dfsUsed = 0;
      for (NamespaceSlice ns : getNamespaceMapSnapshot().values()) {
        dfsUsed += ns.getDfsUsed();
      }
      return dfsUsed;
    }
   
    long getNSUsed(int namespaceId) throws IOException {
      return getNamespaceMapSnapshot().get(namespaceId).getDfsUsed();
    }
       
    long getCapacity() throws IOException {
      if (reserved > usage.getCapacity()) {
        return 0;
      }

      return usage.getCapacity()-reserved;
    }
     
    long getAvailable() throws IOException {
      long remaining = getCapacity()-getDfsUsed();
      long available = usage.getAvailable();
      if (remaining>available) {
        remaining = available;
      }
      return (remaining > 0) ? remaining : 0;
    }
   
    long getReserved() {
      return this.reserved;
    }
     
    String getMount() throws IOException {
      return usage.getMount();
    }

    String getFileSystem() throws IOException {
      return usage.getFilesystem();
    }

    File addBlock(int namespaceId, Block b, File f, boolean inlineChecksum,
        int checksumType, int bytesPerChecksum) throws IOException {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      return ns.addBlock(b, f, inlineChecksum, checksumType, bytesPerChecksum);
    }
     
    void checkDirs() throws DiskErrorException {
      for (NamespaceSlice ns : getNamespaceMapSnapshot().values()) {
        ns.checkDirs();
      }
    }
   
    /**
     * Temporary files. They get moved to the finalized block directory when
     * the block is finalized.
     */
    File createTmpFile(int namespaceId, Block b) throws IOException {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      return ns.createTmpFile(b);
    }
   
    File getTmpFile(int namespaceId, Block b) throws IOException {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      return ns.getTmpFile(b);
    }
   
    /**
     * Temporary files. They get moved to the finalized block directory when
     * the block is finalized.
     */
    File createTmpFile(int namespaceId, Block b, boolean replicationRequest,
        boolean inlineChecksum, int checksumType, int bytesPerChecksum)
        throws IOException {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      return ns.createTmpFile(b, replicationRequest, inlineChecksum,
          checksumType, bytesPerChecksum);
    }
   
    /**
     * Files used for copy-on-write. They need recovery when datanode
     * restarts.
     */
    File createDetachFile(int namespaceId, Block b, String filename) throws IOException {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      return ns.createDetachFile(b);
    }
   
    public void addNamespace(int namespaceId, String nsDir, Configuration conf, boolean supportAppends)
        throws IOException {
      File nsdir = new File(currentDir, nsDir);
      NamespaceSlice ns = new NamespaceSlice(namespaceId, this, nsdir, conf, supportAppends);
      namespaceMap.addNamespace(namespaceId, ns);
    }
   
    void getBlocksBeingWrittenInfo(int namespaceId,
        LightWeightHashSet<Block> blockSet) throws IOException {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      if (ns == null) {
        return;
      }
      ns.getBlocksBeingWrittenInfo(blockSet);
      return;
    }

    public void shutdownNamespace(int namespaceId) {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      if (ns != null) {
        this.namespaceMap.removeNamespace(namespaceId);
        ns.shutdown();
      }
    }

    void getBlockInfo(int namespaceId, LightWeightHashSet<Block> blockSet)
        throws IOException {
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      ns.getBlockInfo(blockSet);
      return;
    }
   
    public void shutdown() {
      for (NamespaceSlice ns : getNamespaceMapSnapshot().values()) {
        ns.shutdown();
      }
      dfsUsage.shutdown();
      nativeIOExecutor.shutdownNow();
    }
   
    void clearPath(int namespaceId, File f) throws IOException{
      NamespaceSlice ns = getNamespaceSlice(namespaceId);
      ns.clearPath(f);
      return;
    }
   
    public String toString() {
      return currentDir.getAbsolutePath();
    }
  }

  /**
   * This class maintains a list of FSVolume objects.
   * Only three operations are supported: add volumes, remove volumes,
   * and get a snapshot of the list of the volumes, which is an immutable
   * object.
   */
  static class FSVolumeList {
    volatile FSVolume[] fsVolumes = null;
   
    public FSVolumeList(FSVolume[] volumes) {
      fsVolumes = volumes;
    }
   
    public synchronized void addVolumes(FSVolume[] volArray) {
      if (volArray == null || volArray.length == 0) {
        return;
      }

      int size = fsVolumes.length + volArray.length;
      FSVolume fsvs[] = new FSVolume[size];
      int idx = 0;
      for (; idx < fsVolumes.length; idx++) {
        fsvs[idx] = fsVolumes[idx];
      }
      for (; idx < size; idx++) {
        fsvs[idx] = volArray[idx - fsVolumes.length];
      }
      fsVolumes = fsvs;
    }
   
    public synchronized void removeVolumes(List<FSVolume> removed_vols) {
      // repair array - copy non null elements
      int removed_size = (removed_vols == null) ? 0 : removed_vols.size();
      if (removed_size > 0) {
        FSVolume fsvs[] = new FSVolume[fsVolumes.length - removed_size];
        for (int idx = 0, idy = 0; idx < fsVolumes.length; idx++) {
          if (!removed_vols.contains(fsVolumes[idx])) {
            fsvs[idy] = fsVolumes[idx];
            idy++;
          }
        }
        fsVolumes = fsvs; // replace array of volumes
      }
    }
   
    public FSVolume[] getVolumeListSnapshot() {
      return fsVolumes;
    }
  }

  static class FSVolumeSet {
    final FSVolumeList volumeList;
    int curVolume = 0;

    ExecutorService scannersExecutor;
    boolean supportAppends;

    private FSVolumeSet(FSVolume[] volumes, int threads, boolean supportAppends) {
      this.volumeList = new FSVolumeList(volumes);
      this.supportAppends = supportAppends;
      if (threads > 1) {
        scannersExecutor = Executors.newFixedThreadPool(threads);
      }
    }
   
    public boolean isValidDir(File currentDir) {
      FSVolume[] volumes = this.getVolumes();
      for (int idx = 0; idx < volumes.length; idx++) {
        if (volumes[idx].getCurrentDir().equals(currentDir)) {
          return true;
        }
      }
      return false;
    }
   
    protected void addVolumes(FSVolume[] volArray) {
      volumeList.addVolumes(volArray);
    }
   
    protected int numberOfVolumes() {
      return getVolumes().length;
    }

    public FSVolume[] getVolumes() {
      return volumeList.getVolumeListSnapshot();
    }
   
    boolean isValid(FSVolume volume) {
      for (FSVolume vol : volumeList.getVolumeListSnapshot()) {
        if (vol == volume) {
          return true;
        }
      }
      return false;
    }
     
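    /**
     * Round-robin volume selection: return the next volume with more than
     * blockSize bytes available, or throw DiskOutOfSpaceException after a
     * full pass over all volumes.
     */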
    private FSVolume getNextVolume(long blockSize) throws IOException {
      FSVolume[] volumes = this.getVolumes();

      if(volumes.length < 1) {
        throw new DiskOutOfSpaceException("No more available volumes");
      }
     
      // since volumes could've been removed because of the failure
      // make sure we are not out of bounds
      if (curVolume >= volumes.length) {
        curVolume = 0;
      }

      int startVolume = curVolume;

      while (true) {
        FSVolume volume = volumes[curVolume];
        curVolume = (curVolume + 1) % volumes.length;
        if (volume.getAvailable() > blockSize) {
          return volume;
        }
        if (curVolume == startVolume) {
          throw new DiskOutOfSpaceException(
              "Insufficient space for an additional block");
        }
      }
    }
     
    private long getDfsUsed() throws IOException {
      long dfsUsed = 0L;
      FSVolume[] volumes = this.getVolumes();

      for (int idx = 0; idx < volumes.length; idx++) {
        dfsUsed += volumes[idx].getDfsUsed();
      }
      return dfsUsed;
    }

    private long getNSUsed(int namespaceId) throws IOException {
      long dfsUsed = 0L;
      FSVolume[] volumes = this.getVolumes();

      for (int idx = 0; idx < volumes.length; idx++) {
        dfsUsed += volumes[idx].getNSUsed(namespaceId);
      }
      return dfsUsed;
    }
   
    private long getCapacity() throws IOException {
      long capacity = 0L;
      FSVolume[] volumes = this.getVolumes();

      for (int idx = 0; idx < volumes.length; idx++) {
        capacity += volumes[idx].getCapacity();
      }
      return capacity;
    }
     
    private long getRemaining() throws IOException {
      long remaining = 0L;
      FSVolume[] volumes = this.getVolumes();

      for (int idx = 0; idx < volumes.length; idx++) {
        remaining += volumes[idx].getAvailable();
      }
      return remaining;
    }
   
    private void getBlocksBeingWrittenInfo(int namespaceId,
        LightWeightHashSet<Block> blockSet) throws IOException {
      long startTime = System.currentTimeMillis();
      FSVolume[] volumes = this.getVolumes();

      if (scannersExecutor != null) {
        synchronized(scannersExecutor) {
          List<Future<LightWeightHashSet<Block>>> builders =
              new ArrayList<Future<LightWeightHashSet<Block>>>();
          for (int idx = 0; idx < volumes.length; idx++) {
            builders.add(scannersExecutor
                .submit(new BlocksBeingWrittenInfoBuilder(volumes[idx],
                    namespaceId)));
          }
          for (Future<LightWeightHashSet<Block>> future : builders) {
            try {
              blockSet.addAll(future.get());
            } catch (ExecutionException ex) {
              DataNode.LOG.error(
                  "Error generating block being written info from volumes ",
                  ex.getCause());
              throw new IOException(ex);
            } catch (InterruptedException iex) {
              DataNode.LOG.error(
                  "Error waiting for generating block being written info", iex);
              throw new IOException(iex);
            }
          }
        }
      } else {
        for (int idx = 0; idx < volumes.length; idx++) {
          volumes[idx].getBlocksBeingWrittenInfo(namespaceId, blockSet);
        }
      }
      long scanTime = (System.currentTimeMillis() - startTime)/1000;
      DataNode.LOG.info("Finished generating blocks being written report for " +
          volumes.length + " volumes in " + scanTime + " seconds");
    }
     
    private void getBlockInfo(int namespaceId, LightWeightHashSet<Block> blockSet) {
      long startTime = System.currentTimeMillis();
      FSVolume[] volumes = this.getVolumes();

      if (scannersExecutor != null) {
        synchronized (scannersExecutor) {
          List<Future<LightWeightHashSet<Block>>> builders =
              new ArrayList<Future<LightWeightHashSet<Block>>>();
          for (int idx = 0; idx < volumes.length; idx++) {
            builders.add(scannersExecutor.submit(new BlockInfoBuilder(
                volumes[idx], namespaceId)));
          }
          for (Future<LightWeightHashSet<Block>> future : builders) {
            try {
              blockSet.addAll(future.get());
            } catch (ExecutionException ex) {
              DataNode.LOG.error("Error scanning volumes ", ex.getCause());
            } catch (InterruptedException iex) {
              DataNode.LOG.error("Error waiting for scan", iex);
            }
          }
        }
      } else {
        for (int idx = 0; idx < volumes.length; idx++) {
          try {
            volumes[idx].getBlockInfo(namespaceId, blockSet);
          } catch (IOException e) {
            DataNode.LOG.error("Error scanning volumes ", e.getCause());
          }
        }
      }
      long scanTime = (System.currentTimeMillis() - startTime)/1000;
      DataNode.LOG.info("Finished generating block report for " +
          volumes.length + " volumes in " + scanTime + " seconds");
    }
     
    /**
     * goes over all the volumes and calls checkDirs on each one of them;
     * if one throws DiskErrorException - it is removed from the list of
     * active volumes.
     * @return list of all the removed volumes
     */
    private List<FSVolume> checkDirs() {     
      List<FSVolume> removed_vols = null;

      FSVolume[] fsVolumes = this.getVolumes();
      for (int idx = 0; idx < fsVolumes.length; idx++) {
        FSVolume fsv = fsVolumes[idx];
        try {
          fsv.checkDirs();
        } catch (DiskErrorException e) {
          DataNode.LOG.warn("Removing failed volume " + fsv + ": ", e);
          if (removed_vols == null) {
            removed_vols = new ArrayList<FSVolume>();
          }
          removed_vols.add(fsVolumes[idx]);
        }
      }

      if (removed_vols != null && removed_vols.size() > 0) {
        volumeList.removeVolumes(removed_vols);
        DataNode.LOG.info("Completed FSVolumeSet.checkDirs. Removed="
            + removed_vols.size() + " volumes. List of current volumes: "
            + toString());
      }

      return removed_vols;
    }
   
    private List<FSVolume> removeBVolumes(List<File> directories) {
      ArrayList<FSVolume> removed_vols = new ArrayList<FSVolume>();
      if (directories != null && directories.size() > 0) {
        FSVolume[] fsVolumes = this.getVolumes();
        for(int idx = 0; idx < fsVolumes.length; idx++) {
          FSVolume fsv = fsVolumes[idx];
          if(directories.contains(fsv.getDir())) {
            removed_vols.add(fsv);
          }
        }
        volumeList.removeVolumes(removed_vols);
        DataNode.LOG.info("Completed FSVolumeSet.removeVolumes. Removed="
            + removed_vols.size() + " volumes. List of current volumes: "
            + toString());
      }
      return removed_vols;
    }
   
    private void addNamespace(int namespaceId, String nsDir, Configuration conf)
        throws IOException {
      FSVolume[] volumes = this.getVolumes();

      for (FSVolume v : volumes) {
        v.addNamespace(namespaceId, nsDir, conf, supportAppends);
      }
    }

    private void removeNamespace(int namespaceId) {
      FSVolume[] volumes = this.getVolumes();

      for (FSVolume v : volumes) {
        v.shutdownNamespace(namespaceId);
      }
    }
   
    public String toString() {
      StringBuffer sb = new StringBuffer();
      FSVolume[] volumes = this.getVolumes();

      for (int idx = 0; idx < volumes.length; idx++) {
        sb.append(volumes[idx].toString());
        if (idx != volumes.length - 1) { sb.append(","); }
      }
      return sb.toString();
    }
  }
 
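  /**
   * Callable that scans a single volume and returns the set of finalized
   * blocks it holds for a namespace; used by FSVolumeSet.getBlockInfo to
   * scan volumes in parallel.
   */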
  private static class BlockInfoBuilder implements Callable<LightWeightHashSet<Block>> {
    FSVolume volume;
    int namespaceId;

    public BlockInfoBuilder(FSVolume volume, int namespaceId) {
      this.volume = volume;
      this.namespaceId = namespaceId;
    }

    @Override
    public LightWeightHashSet<Block> call() throws Exception {
      LightWeightHashSet<Block> result = new LightWeightHashSet<Block>();
      volume.getBlockInfo(namespaceId, result);
      return result;
    }
  }

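  /**
   * Callable that scans a single volume's RBW directory and returns the set
   * of blocks being written for a namespace; used by
   * FSVolumeSet.getBlocksBeingWrittenInfo to scan volumes in parallel.
   */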
  private static class BlocksBeingWrittenInfoBuilder implements
      Callable<LightWeightHashSet<Block>> {
    FSVolume volume;
    int namespaceId;

    public BlocksBeingWrittenInfoBuilder(FSVolume volume, int namespaceId) {
      this.volume = volume;
      this.namespaceId = namespaceId;
    }

    @Override
    public LightWeightHashSet<Block> call() throws Exception {
      LightWeightHashSet<Block> result = new LightWeightHashSet<Block>();
      volume.getBlocksBeingWrittenInfo(namespaceId, result);
      return result;
    }
  }
  //////////////////////////////////////////////////////
  //
  // FSDataSet
  //
  //////////////////////////////////////////////////////

  //Find better place?
  public static final String METADATA_EXTENSION = ".meta";
  public static final short FORMAT_VERSION_NON_INLINECHECKSUM = 1;
  public static final short FORMAT_VERSION_INLINECHECKSUM = 2;
  public static final String DELETE_FILE_EXT = "toDelete.";

  static class ActiveFile implements ReplicaToRead, ReplicaBeingWritten,
      Cloneable {
    static final long UNKNOWN_SIZE = -1;
   
    DatanodeBlockInfo datanodeBlockInfo;
    final List<Thread> threads = new ArrayList<Thread>(2);
    private volatile long bytesReceived;
    private volatile long bytesAcked;
    private volatile long bytesOnDisk;
    private volatile boolean finalized;
    private volatile BlockCrcUpdater crcUpdater;

    /**
     * Set to true if this file was recovered during datanode startup.
     * This may indicate that the file has been truncated (e.g. during
     * underlying filesystem journal replay)
     */
    final boolean wasRecoveredOnStartup;

    ActiveFile(DatanodeBlockInfo datanodeBlockInfo, List<Thread> list,
        long expectedSize, boolean enable) throws IOException {
      this(datanodeBlockInfo, false, expectedSize, enable);
      if (list != null) {
        threads.addAll(list);
      }
      threads.add(Thread.currentThread());
    }

    /**
     * Create an ActiveFile from a file on disk during DataNode startup.
     * This constructor is kept separate just to make it clear what its
     * purpose is.
     * @throws IOException
     */
    private ActiveFile(DatanodeBlockInfo datanodeBlockInfo, boolean recovery,
                       long expectedSize, boolean enable)
        throws IOException {
      this.datanodeBlockInfo = datanodeBlockInfo;
      long sizeFromDisk;

      if (!isInlineChecksum()) {
        sizeFromDisk = getDataFile().length();
      } else {
        GenStampAndChecksum sac = BlockInlineChecksumReader
            .getGenStampAndChecksumFromInlineChecksumFile(getDataFile()
                .getName());
        sizeFromDisk = BlockInlineChecksumReader.getBlockSizeFromFileLength(
            getDataFile().length(), sac.checksumType, sac.bytesPerChecksum);
      }
      if (expectedSize != UNKNOWN_SIZE && sizeFromDisk != expectedSize) {
        throw new IOException("File " + getDataFile()
            + " on disk size " + sizeFromDisk + " doesn't match expected size "
            + expectedSize);
      }
      bytesReceived = bytesAcked = bytesOnDisk = sizeFromDisk;
      crcUpdater = new BlockCrcUpdater(this.getBytesPerChecksum(),
          enable && bytesReceived == 0);
      wasRecoveredOnStartup = recovery;
      finalized = false;     
    }

    @Override
    public long getBytesVisible() {
      return bytesAcked;
    }

    public void setBytesAcked(long value) {
      bytesAcked = value;
    }

    @Override
    public long getBytesWritten() {
      return bytesOnDisk;
    }

    public void setBytesOnDisk(long value) {
      bytesOnDisk = value;
    }
   
    public long getBytesReceived() {
      return bytesReceived;
    }

    public void setBytesReceived(long length) {
      bytesReceived = length;
    }

    @Override
    public File getDataFileToRead() {
      return datanodeBlockInfo.getDataFileToRead();
    }

    private File getDataFile() {
      return datanodeBlockInfo.getBlockDataFile().getFile();
    }
   
    public String toString() {
      return getClass().getSimpleName() + "(file=" + getDataFile()
          + ", threads=" + threads + ")";
    }
   
    public ActiveFile getClone() throws CloneNotSupportedException {
      return (ActiveFile) super.clone();
    }

    @Override
    public boolean isInlineChecksum() {
      return datanodeBlockInfo.isInlineChecksum();
    }

    @Override
    public int getChecksumType() {
      return datanodeBlockInfo.getChecksumType();
    }

    @Override
    public int getBytesPerChecksum() {
      return datanodeBlockInfo.getBytesPerChecksum();
    }

    @Override
    public InputStream getBlockInputStream(DataNode datanode, long offset)
        throws IOException {
      return datanodeBlockInfo.getBlockInputStream(datanode, offset);
    }

    @Override
    public boolean isFinalized() {
      return finalized;
    }

    protected void blockFinalize() {
      this.finalized = true;
    }

    @Override
    public int getBlockCrc() throws IOException{
      throw new IOException("Block not finalized.");
    }

    @Override
    public void updateBlockCrc(long offset, int length,
        int crc) {
      crcUpdater.updateBlockCrc(offset, length, crc);
    }

    @Override
    public boolean hasBlockCrcInfo() {
      return false;
    }

    BlockCrcUpdater getCrcUpdater() {
      return crcUpdater;
    }

    @Override
    public BlockDataFile getBlockDataFile() throws IOException {
      return datanodeBlockInfo.getBlockDataFile();
    }
  } 
 
  /**
   * Check if a file is scheduled for deletion.
   * The name should be obtained via File.getName().
   */
  static boolean isPendingDeleteFilename(String name) {
    return name.startsWith(DELETE_FILE_EXT);
  }
 
  public Block getStoredBlock(int namespaceId, long blkid) throws IOException {
    return getStoredBlock(namespaceId, blkid, false);
  }
  /** {@inheritDoc} */
  public Block getStoredBlock(int namespaceId, long blkid,
      boolean useOnDiskLength) throws IOException {
    lock.readLock().lock();
    try {
      ReplicaToRead replica = getReplicaToRead(namespaceId, new Block(
          blkid));
      if (replica == null) {
        return null;
      }
      File blockfile = replica.getDataFileToRead();
      if (blockfile == null) {
        return null;
      }
     
      File metafile = null;
      if (!replica.isInlineChecksum()) {
        metafile = BlockWithChecksumFileWriter.findMetaFile(blockfile, true);
        if (metafile == null) {
          return null;
        }
      }
      Block block = new Block(blkid);
      if (useOnDiskLength) {
        block.setNumBytes(replica.getBytesWritten());
      } else {
        block.setNumBytes(replica.getBytesVisible());
      }
      if (replica.isInlineChecksum()) {
        block.setGenerationStamp(BlockInlineChecksumReader
            .getGenerationStampFromInlineChecksumFile(blockfile.getName()));
      } else {
        block.setGenerationStamp(BlockWithChecksumFileReader
            .parseGenerationStampInMetaFile(blockfile, metafile));
      }
      return block;
    } finally {
      lock.readLock().unlock();
    }
  }

  FSVolumeSet volumes;
  private DataNode datanode;
  private Configuration conf;
  private int maxBlocksPerDir = 0;
  private boolean initialized = false;
 
  VolumeMap volumeMap;
  BlockCrcMapFlusher blockCrcMapFlusher;
  Thread blockCrcMapFlusherThread = null;
  static  Random random = new Random();
  FSDatasetAsyncDiskService asyncDiskService;
  ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true);
  private boolean shouldHardLinkBlockCopy;
  private int validVolsRequired;
 
  //this constructor is used to create PersistedSimulatedFSDataset
  public FSDataset() {
  }
 
  /**
   * An FSDataset has a directory where it loads its data files.
   */
  public FSDataset(DataNode datanode, Configuration conf, int numNamespaces){
    this.datanode = datanode;
    this.conf = conf;
    this.maxBlocksPerDir = conf.getInt("dfs.datanode.numblocks", 64);
    volumeMap = new VolumeMap(numNamespaces);
  }
 
  void setDatasetDelta(FSDatasetDeltaInterface stateChangeCallback) {
    volumeMap.setDatasetDelta(stateChangeCallback);
  }
 
  @Override
  public void initialize(DataStorage storage) throws IOException{
    lock.writeLock().lock();
    try{
      if(initialized){
        return;
      }
     
      // The number of volumes required for operation is the total number
      // of volumes configured minus the number of failed volumes we can
      // tolerate.
      String[] dataDirs = DataNode.getListOfDataDirs(conf);
      int volsConfigured = (dataDirs == null) ? 0 : dataDirs.length;
      final int volFailuresTolerated =
        conf.getInt("dfs.datanode.failed.volumes.tolerated", volsConfigured-1);
      this.validVolsRequired = volsConfigured - volFailuresTolerated;
      if (validVolsRequired < 1 || validVolsRequired > storage.getNumStorageDirs()) {
        throw new DiskErrorException("Too many failed volumes - "
                  + "current valid volumes: " + storage.getNumStorageDirs()
                  + ", volumes configured: " + volsConfigured
                  + ", volume failures tolerated: " + volFailuresTolerated );
      }
      File[] roots = new File[storage.getNumStorageDirs()];
      for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
        roots[idx] = storage.getStorageDir(idx).getCurrentDir();
      }
      asyncDiskService = new FSDatasetAsyncDiskService(roots, conf);  
      FSVolume[] volArray = new FSVolume[storage.getNumStorageDirs()];
      for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
        volArray[idx] = new FSVolume(this, storage.getStorageDir(idx).getCurrentDir(),
            conf);
        DataNode.LOG.info("FSDataset added volume - "
            + storage.getStorageDir(idx).getCurrentDir());
      }
      int threads = conf.getInt("dfs.datanode.blockscanner.threads", 1);
      volumes = new FSVolumeSet(volArray, threads, datanode.isSupportAppends());
      registerMBean(storage.getStorageID());
      blockCrcMapFlusher = new BlockCrcMapFlusher(datanode, volumeMap,
          volumes, conf.getLong("dfs.block.crc.flush.interval", 600000));
      blockCrcMapFlusherThread = new Thread(blockCrcMapFlusher,
          "Block Crc Flusher");
      blockCrcMapFlusherThread.start();
      initialized = true;
    } finally {
      lock.writeLock().unlock();
    }
    shouldHardLinkBlockCopy = conf.getBoolean("dfs.datanode.blkcopy.hardlink",
        true);
  }

  private class VolumeThread extends Thread {
    private Configuration conf;
    private FSVolume volume;
    private boolean hasError = false;
    private Map<Integer, String> namespaceIdDir;
    private boolean supportAppends;

    private VolumeThread(FSVolume volume,
        Configuration conf,
        Map<Integer, String> namespaceIdDir, boolean supportAppends) {
      this.namespaceIdDir = namespaceIdDir;
      this.volume = volume;
      this.conf = conf;
      this.supportAppends = supportAppends;

    }

    public void run() {
      DataNode.LOG.info("Start building volume: " + volume);
      try {
        for (Integer namespaceId : namespaceIdDir.keySet()) {
          volume.addNamespace(namespaceId, namespaceIdDir.get(namespaceId),
              conf, supportAppends);
        }      
      } catch (IOException ioe) {
        DataNode.LOG.error("Error building volume : " + volume, ioe);
        hasError = true;
      }
      DataNode.LOG.info("Finish building volume for " + volume);
    }
  }
 
  private void createVolumes(FSVolumeSet volumes, DataStorage storage,
      Configuration conf, VolumeMap volumeMap,
      Map<Integer, String> namespaceIdDir) throws IOException {
    FSVolume[] myVolumes = volumes.getVolumes();

    ArrayList<VolumeThread> scanners = new ArrayList<VolumeThread>(
        myVolumes.length);
   
    for(FSVolume volume : myVolumes){
      scanners.add(new VolumeThread(volume, conf,
          namespaceIdDir, volumes.supportAppends));
    }
   
    for(VolumeThread vt : scanners){
      vt.start();
    }
    boolean hasError = false;
    for (VolumeThread vt : scanners) {
      try {
        vt.join();
      } catch (InterruptedException e) {
        throw (InterruptedIOException)new InterruptedIOException().initCause(e);
      }
      if (!hasError && vt.hasError) {
        hasError = true;
      }

    }
    if (hasError) {
      throw new IOException("Error creating volumes");
    }
  }


  /**
   * Return the total space used by dfs datanode
   */
  public long getDfsUsed() throws IOException {
    return volumes.getDfsUsed();
  }
 
  /**
   * Return the total space used by one namespace in dfs datanode
   */
  public long getNSUsed(int namespaceId) throws IOException {
    return volumes.getNSUsed(namespaceId);
  }
 
  /**
   * Return true if the DataNode still has at least the minimum
   * required number of valid volumes.
   */
  public boolean hasEnoughResource(){
    return volumes.numberOfVolumes() >= this.validVolsRequired;
  }

  /**
   * Return total capacity, used and unused
   */
  public long getCapacity() throws IOException {
    return volumes.getCapacity();
  }

  /**
   * Return how many bytes can still be stored in the FSDataset
   */
  public long getRemaining() throws IOException {
    return volumes.getRemaining();
  }

  /**
   * Find the block's on-disk length
   */
  public long getFinalizedBlockLength(int namespaceId, Block b) throws IOException {
    DatanodeBlockInfo info = volumeMap.get(namespaceId, b);
    if (info == null) {
      throw new IOException("Can't find block " + b + " in volumeMap");
    }
    return info.getFinalizedSize();
  }

  @Override
  public long getOnDiskLength(int namespaceId, Block b) throws IOException {
    ReplicaToRead rtr = this.getReplicaToRead(namespaceId, b);
    if (rtr == null) {
      throw new IOException("Can't find block " + b + " in volumeMap");
    }
    return rtr.getBytesWritten();
  }

  @Override
  public ReplicaBeingWritten getReplicaBeingWritten(
      int namespaceId, Block b) throws IOException {
    lock.readLock().lock();
    try {
      return volumeMap.getOngoingCreates(namespaceId, b);
    } finally {
      lock.readLock().unlock();
    }
  } 

  /**
   * Get File name for a given block.
   */
  public File getBlockFile(int namespaceId, Block b) throws IOException {
    File f = validateBlockFile(namespaceId, b);
    if (f == null) {
      if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
        InterDatanodeProtocol.LOG
            .debug("b=" + b + ", volumeMap=" + volumeMap);
      }
      throw new IOException("Block " + b + ", namespace= " + namespaceId
          + " is not valid.");
    }
    return f;
  }

  /**
   * Make a copy of the block if this block is linked to an existing
   * snapshot. This ensures that modifying this block does not modify
   * data in any existing snapshots.
   * @param block Block
   * @param numLinks Detach if the number of links exceed this value
   * @throws IOException
   * @return - true if the specified block was detached
   */
  public boolean detachBlock(int namespaceId, Block block, int numLinks) throws IOException {
    DatanodeBlockInfo info = null;

    lock.readLock().lock();
    try {
      info = volumeMap.get(namespaceId, block);
    } finally {
      lock.readLock().unlock();
    }
   
    return info.detachBlock(namespaceId, block, numLinks);
  }

  /** {@inheritDoc} */
  public void updateBlock(int namespaceId, Block oldblock, Block newblock) throws IOException {
    if (oldblock.getBlockId() != newblock.getBlockId()) {
      throw new IOException("Cannot update oldblock (=" + oldblock
          + ") to newblock (=" + newblock + ").");
    }

    // Protect against a straggler updateblock call moving a block backwards
    // in time.
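    // For example, a recovery that bumps the generation stamp from 5 to 6 may
    // also shrink the block length, while a call that keeps the generation
    // stamp unchanged must keep the length unchanged as well; anything else is
    // treated as a stale, out-of-order update and rejected below.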
    boolean isValidUpdate =
      (newblock.getGenerationStamp() > oldblock.getGenerationStamp()) ||
      (newblock.getGenerationStamp() == oldblock.getGenerationStamp() &&
       newblock.getNumBytes() == oldblock.getNumBytes());

    if (!isValidUpdate) {
      throw new IOException(
        "Cannot update oldblock=" + oldblock +
        " to newblock=" + newblock + " since generation stamps must " +
        "increase, or else length must not change.");
    }

    for(;;) {
      final List<Thread> threads = tryUpdateBlock(namespaceId, oldblock, newblock);
      if (threads == null) {
        DataNode.LOG.info("Updated Block: namespaceid: " + namespaceId + " oldBlock: "
            + oldblock + " newBlock: " + newblock);
        return;
      }

      DataNode.LOG.info("Waiting other threads to update block: namespaceid: "
          + namespaceId + " oldBlock: " + oldblock + " newBlock: " + newblock);
      interruptAndJoinThreads(threads);
    }
  }

  /**
   * Try to interrupt all of the given threads, and join on them.
   * If interrupted, returns false, indicating some threads may
   * still be running.
   */
  private boolean interruptAndJoinThreads(List<Thread> threads) {
    // interrupt and wait for all ongoing create threads
    for(Thread t : threads) {
      t.interrupt();
    }
    for(Thread t : threads) {
      try {
        t.join();
      } catch (InterruptedException e) {
        DataNode.LOG.warn("interruptOngoingCreates: t=" + t, e);
        return false;
      }
    }
    return true;
  }


  /**
   * Return a list of active writer threads for the given block.
   * @return null if there are no such threads or the file is
   * not being created
   */
  private ArrayList<Thread> getActiveThreads(int namespaceId, Block block) {
    lock.writeLock().lock();
    try {
      //check ongoing create threads
      final ActiveFile activefile = volumeMap.getOngoingCreates(namespaceId, block);
      if (activefile != null && !activefile.threads.isEmpty()) {
        //remove dead threads
        for(Iterator<Thread> i = activefile.threads.iterator(); i.hasNext(); ) {
          final Thread t = i.next();
          if (!t.isAlive()) {
            i.remove();
          }
        }
 
        //return living threads
        if (!activefile.threads.isEmpty()) {
          return new ArrayList<Thread>(activefile.threads);
        }
      }
    } finally {
      lock.writeLock().unlock();
    }
    return null;
  }
 
  private void setDataFileForBlock(int namespaceId, Block block, File newDataFile) {
    DatanodeBlockInfo info = volumeMap.get(namespaceId, block);
    if (info != null) {
      info.getBlockDataFile().setFile(newDataFile);
    }
  }
 
  /**
   * Try to update an old block to a new block.
   * If there are ongoing create threads running for the old block,
   * the threads will be returned without updating the block.
   *
   * @return ongoing create threads if there is any. Otherwise, return null.
   */
  private List<Thread> tryUpdateBlock(int namespaceId,
      Block oldblock, Block newblock) throws IOException {
    lock.writeLock().lock();
    try {
      //check ongoing create threads
      ArrayList<Thread> activeThreads = getActiveThreads(namespaceId, oldblock);
      if (activeThreads != null) {
        return activeThreads;
      }

      DatanodeBlockInfo binfo = volumeMap.get(namespaceId, oldblock);
      if (binfo == null) {
        throw new IOException("Block " + oldblock
            + " doesn't exist or has been recovered to a new generation ");
      }

      File blockFile = binfo.getBlockDataFile().getFile();
 
      long oldgs;
      File oldMetaFile = null;
      if (binfo.isInlineChecksum()) {
        oldgs = BlockInlineChecksumReader
            .getGenerationStampFromInlineChecksumFile(blockFile.getName());
      } else {
        oldMetaFile = BlockWithChecksumFileWriter.findMetaFile(blockFile);
        oldgs = BlockWithChecksumFileReader.parseGenerationStampInMetaFile(
            blockFile, oldMetaFile);
      }
     
      // First validate the update

      //update generation stamp
      if (oldgs > newblock.getGenerationStamp()) {
        throw new IOException("Cannot update block (id=" + newblock.getBlockId()
            + ") generation stamp from " + oldgs
            + " to " + newblock.getGenerationStamp());
      }
     
      //update length
      if (newblock.getNumBytes() > oldblock.getNumBytes()) {
        throw new IOException("Cannot update block file (=" + blockFile
            + ") length from " + oldblock.getNumBytes() + " to " + newblock.getNumBytes());
      }

      // Although we've waited for all active threads to finish before updating
      // the map, so there should be no data race, we still create a new
      // ActiveFile object to make sure that if another thread still holds a
      // reference to the old one, it won't cause any problem for us.
      //
      try {
        volumeMap.copyOngoingCreates(namespaceId, oldblock);
      } catch (CloneNotSupportedException e) {
        // It should never happen.
        throw new IOException("Cannot clone ActiveFile object", e);
      }

      // Now perform the update
      File tmpMetaFile = null;
      if (!binfo.isInlineChecksum()) {
        // rename meta file to a tmp file
        tmpMetaFile = new File(oldMetaFile.getParent(),
            oldMetaFile.getName() + "_tmp" + newblock.getGenerationStamp());
        if (!oldMetaFile.renameTo(tmpMetaFile)) {
          throw new IOException("Cannot rename block meta file to " + tmpMetaFile);
        }
      }

      long oldBlockLength;
      if (!binfo.isInlineChecksum()) {
        oldBlockLength = blockFile.length();
      } else {
        oldBlockLength = BlockInlineChecksumReader.getBlockSizeFromFileLength(
            blockFile.length(), binfo.getChecksumType(),
            binfo.getBytesPerChecksum());
      }
      ActiveFile file = null;
      if (newblock.getNumBytes() < oldBlockLength) {
        if (!binfo.isInlineChecksum()) {
          new BlockWithChecksumFileWriter(binfo.getBlockDataFile(), tmpMetaFile)
            .truncateBlock(oldBlockLength, newblock.getNumBytes());
        } else {
          new BlockInlineChecksumWriter(binfo.getBlockDataFile(), binfo.getChecksumType(),
              binfo.getBytesPerChecksum(), datanode.writePacketSize)
            .truncateBlock(newblock.getNumBytes());
        }
        file = volumeMap.getOngoingCreates(namespaceId, oldblock);
        if (file != null) {
          file.setBytesAcked(newblock.getNumBytes());
          file.setBytesOnDisk(newblock.getNumBytes());
          file.setBytesReceived(newblock.getNumBytes());
        } else {
          // This should never happen unless called from unit tests.
          binfo.syncInMemorySize();
        }
      }

      String newDataFileName;
      if (!binfo.isInlineChecksum()) {
        //rename the tmp file to the new meta file (with new generation stamp)
        File newMetaFile = BlockWithChecksumFileWriter.getMetaFile(blockFile, newblock);
        if (!tmpMetaFile.renameTo(newMetaFile)) {
          throw new IOException("Cannot rename tmp meta file to " + newMetaFile);
        }
      } else {
        newDataFileName = BlockInlineChecksumWriter.getInlineChecksumFileName(
            newblock, binfo.getChecksumType(), binfo.getBytesPerChecksum());
        File newDataFile = new File(blockFile.getParent(), newDataFileName);
        if (!blockFile.renameTo(newDataFile)) {
          throw new IOException("Cannot rename data file to " + newDataFileName);
        }
        // fsyncIfPossible parent directory to persist rename.
        if (datanode.syncOnClose) {
          NativeIO.fsyncIfPossible(newDataFile.getParent());
        }
        setDataFileForBlock(namespaceId, oldblock, newDataFile);
      }
 
      if(volumeMap.getOngoingCreates(namespaceId, oldblock) != null){
        ActiveFile af = volumeMap.removeOngoingCreates(namespaceId, oldblock);
        volumeMap.addOngoingCreates(namespaceId, newblock, af);
      }
      volumeMap.update(namespaceId, oldblock, newblock);
 
      // paranoia! verify that the contents of the stored block
      // matches the block file on disk.
      validateBlockMetadata(namespaceId, newblock);
      return null;
    } finally {
      lock.writeLock().unlock();
    }
  }

  private final static String DISK_ERROR = "Possible disk error on file creation: ";
  /** Get the cause of an I/O exception if caused by a possible disk error
   * @param ioe an I/O exception
   * @return cause if the I/O exception is caused by a possible disk error;
   *         null otherwise.
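   *         For example, an exception thrown by createTmpFile(Block, File)
   *         below wraps its underlying cause with the DISK_ERROR prefix, so
   *         this method would return that underlying cause.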
   */
  static IOException getCauseIfDiskError(IOException ioe) {
    if (ioe.getMessage()!=null && ioe.getMessage().startsWith(DISK_ERROR)) {
      return (IOException)ioe.getCause();
    } else {
      return null;
    }
  }

  /**
   * Start writing to a block file
   * If isRecovery is true and the block pre-exists, then we kill all
   * other threads that might be writing to this block, and then reopen the file.
   * If replicationRequest is true, then this operation is part of a block
   * replication request.
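   * <p>
   * Overview of the three paths below: a fresh write allocates a new tmp file
   * on the next volume; recovery of a block that is still being written reuses
   * and re-registers the existing tmp file; an append (newBlock != null) first
   * moves the finalized block file (and its meta file, for non-inline
   * checksums) back into the tmp area.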
   */
  public DatanodeBlockWriter writeToBlock(int namespaceId, Block b, Block newBlock,
      boolean isRecovery, boolean replicationRequest, int checksumType, int bytesPerChecksum)
      throws IOException {
    //
    // Make sure the block isn't a valid one - we're still creating it!
    //
    if (isValidBlock(namespaceId, b, false)) {
      if (!isRecovery) {
        throw new BlockAlreadyExistsException("Block " + b + " is valid, and cannot be written to.");
      }
      // The block may have been successfully finalized because all packets
      // were successfully processed at the Datanode, but the acks for some
      // of the packets were not received by the client. In that case the
      // client re-opens the connection and retries sending those packets.
      // The other reason is that an "append" is occurring to this block.
      detachBlock(namespaceId, b, 1);
    }
    long blockSize = b.getNumBytes();

    //
    // Serialize access to /tmp, and check if file already there.
    //
    File f = null;
    List<Thread> threads = null;
    long expectedFileSize = ActiveFile.UNKNOWN_SIZE;
    boolean inlineChecksum = datanode.useInlineChecksum;
    DatanodeBlockInfo binfo;
    FSVolume v = null;
    Block targetBlock = b;
    if (newBlock != null && newBlock != b) {
      targetBlock = newBlock;
    }

    lock.writeLock().lock();
    try {

      //
      // Is it already in the create process?
      //
      ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, b);
      if (activeFile != null) {
        f = activeFile.getDataFile();
        threads = activeFile.threads;
        expectedFileSize = activeFile.getBytesWritten();
        inlineChecksum = activeFile.isInlineChecksum();

        if (!isRecovery) {
          throw new BlockAlreadyExistsException("Block " + b +
                                  " has already been started (though not completed), and thus cannot be created.");
        } else {
          for (Thread thread:threads) {
            thread.interrupt();
          }
        }
        volumeMap.removeOngoingCreates(namespaceId, b);
      }
      if (!isRecovery) {
        if (newBlock != null && b != newBlock) {
          throw new IOException("newBlock is not allowed except append case. ");
        }
        v = volumes.getNextVolume(blockSize);
        // create temporary file to hold block in the designated volume
        f = createTmpFile(namespaceId, v, b, replicationRequest,
            inlineChecksum, checksumType, bytesPerChecksum);
      } else if (f != null) {
        DataNode.LOG.info("Reopen already-open Block for append " + b);
        if (newBlock != null && b != newBlock) {
          throw new IOException("newBlock is not allowed except append case. ");
        }
        // create or reuse temporary file to hold block in the designated volume
        DatanodeBlockInfo oldBinfo = volumeMap.get(namespaceId, b);
        inlineChecksum = oldBinfo.isInlineChecksum();
        v = oldBinfo.getBlockDataFile().getVolume();
        volumeMap.add(namespaceId, b, new DatanodeBlockInfo(v, f,
            DatanodeBlockInfo.UNFINALIZED, true, inlineChecksum, checksumType,
            bytesPerChecksum, false, 0));
      } else {
        // reopening block for appending to it.
        DataNode.LOG.info("Reopen Block for append " + b);
        if (newBlock == null) {
          throw new IOException(
              "newBlock is required when appending to a file.");
        }
        DatanodeBlockInfo oldBinfo = volumeMap.get(namespaceId, b);
        inlineChecksum = oldBinfo.isInlineChecksum();
        v = oldBinfo.getBlockDataFile().getVolume();
        f = createTmpFile(namespaceId, v, newBlock, replicationRequest,
            inlineChecksum, checksumType, bytesPerChecksum);
        File blkfile = getBlockFile(namespaceId, b);

        if (!inlineChecksum) {
          File oldmeta = BlockWithChecksumFileReader.getMetaFile(this, namespaceId, b);
          File newmeta = BlockWithChecksumFileWriter.getMetaFile(f, newBlock);

          // rename meta file to tmp directory
          DataNode.LOG.debug("Renaming " + oldmeta + " to " + newmeta);
          if (!oldmeta.renameTo(newmeta)) {
            throw new IOException("Block " + b + " reopen failed. "
                + " Unable to move meta file  " + oldmeta + " to tmp dir "
                + newmeta);
          }
        }

        // rename block file to tmp directory
        DataNode.LOG.debug("Renaming " + blkfile + " to " + f);
        if (!blkfile.renameTo(f)) {
          if (!f.delete()) {
            throw new IOException("Block " + b + " reopen failed. " +
                                  " Unable to remove file " + f);
          }
          if (!blkfile.renameTo(f)) {
            throw new IOException("Block " + b + " reopen failed. " +
                                  " Unable to move block file " + blkfile +
                                  " to tmp dir " + f);
          }
        }
        // fsyncIfPossible parent directory to persist rename.
        if (datanode.syncOnClose) {
          NativeIO.fsyncIfPossible(blkfile.getParent());
        }
      }
      if (f == null) {
        DataNode.LOG.warn("Block " + b + " reopen failed " +
                          " Unable to locate tmp file.");
        throw new IOException("Block " + b + " reopen failed " +
                              " Unable to locate tmp file.");
      }
      // If this is a replication request, then this is not a permanent
      // block yet, it could get removed if the datanode restarts. If this
      // is a write or append request, then it is a valid block.
      if (replicationRequest) {
        binfo = new DatanodeBlockInfo(v, f, DatanodeBlockInfo.UNFINALIZED,
            false, inlineChecksum, checksumType, bytesPerChecksum, false, 0);
      } else {
        binfo = new DatanodeBlockInfo(v, f, DatanodeBlockInfo.UNFINALIZED,
            true, inlineChecksum, checksumType, bytesPerChecksum, false, 0);
      }
      if (newBlock != null && newBlock != b) {
        volumeMap.remove(namespaceId, b);
      }

      volumeMap.add(namespaceId, targetBlock, binfo);
      volumeMap.addOngoingCreates(namespaceId, targetBlock, new ActiveFile(binfo,
          threads, expectedFileSize, datanode.updateBlockCrcWhenWrite));
     
    } finally {
      lock.writeLock().unlock();
    }

    try {
      if (threads != null) {
        for (Thread thread:threads) {
          thread.join();
        }
      }
    } catch (InterruptedException e) {
      throw new IOException("Recovery waiting for thread interrupted.");
    }

    //
    // Finally, allow a writer to the block file
    // REMIND - mjc - make this a filter stream that enforces a max
    // block size, so clients can't go crazy
    //
    if (DataNode.LOG.isDebugEnabled()) {
      DataNode.LOG.debug("writeTo blockfile is " + f + " of size " + f.length());
    }
    if (inlineChecksum) {
      return new BlockInlineChecksumWriter(binfo.getBlockDataFile(),
          checksumType, bytesPerChecksum, datanode.writePacketSize);
    } else {
      File metafile = BlockWithChecksumFileWriter.getMetaFile(f, targetBlock);
      if (DataNode.LOG.isDebugEnabled()) {
        DataNode.LOG.debug("writeTo metafile is " + metafile + " of size "
            + metafile.length());
      }
      return new BlockWithChecksumFileWriter(binfo.getBlockDataFile(), metafile);
    }
  }
 
  File createTmpFile(int namespaceId, FSVolume vol, Block blk,
      boolean replicationRequest, boolean inlineChecksum, int checksumType,
      int bytePerChecksum) throws IOException {
    lock.writeLock().lock();
    try {
      if (vol == null) {
        vol = volumeMap.get(namespaceId, blk).getBlockDataFile().getVolume();
        if (vol == null) {
          throw new IOException("Could not find volume for block " + blk);
        }
      }
      return vol.createTmpFile(namespaceId, blk, replicationRequest,
          inlineChecksum, checksumType, bytePerChecksum);
    } finally {
      lock.writeLock().unlock();
    }
  }

  //
  // REMIND - mjc - eventually we should have a timeout system
  // in place to clean up block files left by abandoned clients.
  // We should have some timer in place, so that if a blockfile
  // is created but non-valid, and has been idle for >48 hours,
  // we can GC it safely.
  //

  /**
   * Complete the block write!
   */
  @Override // FSDatasetInterface
  public void finalizeBlock(int namespaceId, Block b) throws IOException {
    finalizeBlockInternal(namespaceId, b, true);
  }

  @Override
  public void finalizeBlockIfNeeded(int namespaceId, Block b) throws IOException {
    finalizeBlockInternal(namespaceId, b, true);
  }
 
  /**
   * Complete the block write!
   */
  public void finalizeBlockInternal(int namespaceId, Block b, boolean reFinalizeOk)
    throws IOException {
    lock.writeLock().lock();
    try {
      DatanodeBlockInfo binfo = volumeMap.get(namespaceId, b);
      ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, b);
      if (activeFile == null) {
        if (reFinalizeOk) {
          return;
        } else {
          throw new IOException("Block " + b + " is already finalized.");
        }
      }
      File f = activeFile.getDataFile();
      if (f == null || !f.exists()) {
        throw new IOException("No temporary file " + f + " for block " + b);
      }
      FSVolume v = binfo.getBlockDataFile().getVolume();
      if (v == null) {
        throw new IOException("No volume for temporary file " + f +
                              " for block " + b);
      }
         
      File dest = null;
      dest = v.addBlock(namespaceId, b, f, activeFile.isInlineChecksum(),
          binfo.getChecksumType(), binfo.getBytesPerChecksum());
      volumeMap.add(
          namespaceId,
          b,
          new DatanodeBlockInfo(v, dest, activeFile.getBytesWritten(), true,
              activeFile.isInlineChecksum(), binfo.getChecksumType(),
              binfo.getBytesPerChecksum(),
              activeFile.getCrcUpdater().isCrcValid(activeFile.getBytesWritten()),
              activeFile.getCrcUpdater().getBlockCrc()));
      ActiveFile af = volumeMap.removeOngoingCreates(namespaceId, b);
      af.blockFinalize();
    } finally {
      lock.writeLock().unlock();
    }
  }

  private boolean isBlockFinalizedInternal(int namespaceId, Block b,
      boolean validate) {
    DatanodeBlockInfo blockInfo = volumeMap.get(namespaceId, b);
   
    // We skip the null check in the validate case to avoid redundant code
    // while preserving the old behavior. Though this looks like a bug, we
    // will fix it in a separate patch.
    //
    if (!validate && blockInfo == null) {
      return false; // block is not finalized
    }
    FSVolume v = blockInfo.getBlockDataFile().getVolume();
    if (v == null) {
      DataNode.LOG.warn("No volume for block " + b);
      return false; // block is not finalized
    }
    ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, b);
    if (activeFile != null) {
      if (validate) {
        File f = activeFile.getDataFile();
        if (f == null || !f.exists()) {
          // we should never get into this position.
          DataNode.LOG.warn("No temporary file " + f + " for block " + b);
        }
      }
      return false; // block is not finalized
    }
    return true; // block is finalized
  }
 
  /**
   * is this block finalized? Returns true if the block is already
   * finalized, otherwise returns false.
   */
  public boolean isBlockFinalized(int namespaceId, Block b) {
    return isBlockFinalizedInternal(namespaceId, b, false);
  }

  /**
   * is this block finalized? Returns true if the block is already
   * finalized, otherwise returns false.
   */
  private boolean isBlockFinalizedWithLock(int namespaceId, Block b) {
    lock.readLock().lock();
    try {
      return isBlockFinalizedInternal(namespaceId, b, true);
    } finally {
      lock.readLock().unlock();
    }
  }
 
  /**
   * Remove the temporary block file (if any)
   */
  public void unfinalizeBlock(int namespaceId, Block b) throws IOException {
    lock.writeLock().lock();
    try {
      // remove the block from in-memory data structure
      ActiveFile activefile = volumeMap.removeOngoingCreates(namespaceId, b);
      if (activefile == null) {
        return;
      }
      volumeMap.remove(namespaceId, b);
     
      // delete the on-disk temp file
      File metaFile = null;
      if (!activefile.isInlineChecksum()) {
        metaFile = BlockWithChecksumFileWriter.getMetaFile(
            activefile.getDataFileToRead(), b);
      }
      if (delBlockFromDisk(activefile.getDataFileToRead(), metaFile, b)) {
        DataNode.LOG.warn("Block " + b + " unfinalized and removed. " );
      }
    } finally {
      lock.writeLock().unlock();
    }
  }

  /**
   * Remove a block from disk
   * @param blockFile block file
   * @param metaFile block meta file
   * @param b a block
   * @return true if on-disk files are deleted; false otherwise
   */
  private boolean delBlockFromDisk(File blockFile, File metaFile, Block b) {
    if (blockFile == null) {
      DataNode.LOG.warn("No file exists for block: " + b);
      return true;
    }

    if (!blockFile.delete()) {
      DataNode.LOG.warn("Not able to delete the block file: " + blockFile);
      return false;
    } else { // remove the meta file
      if (metaFile != null && !metaFile.delete()) {
        DataNode.LOG.warn(
            "Not able to delete the meta block file: " + metaFile);
        return false;
      }
    }
    return true;
  }
 
  /**
   * Return a table of the blocks that are currently being written to.
   * @throws IOException
   */
  public Block[] getBlocksBeingWrittenReport(int namespaceId) throws IOException {
    LightWeightHashSet<Block> blockSet = new LightWeightHashSet<Block>();
    volumes.getBlocksBeingWrittenInfo(namespaceId, blockSet);
    Block blockTable[] = new Block[blockSet.size()];
    int i = 0;
    for (Iterator<Block> it = blockSet.iterator(); it.hasNext(); i++) {
      blockTable[i] = it.next();
    }
    return blockTable;
  }
 
  /**
   * Get the list of finalized blocks from in-memory blockmap for a block pool.
   */
  public Block[] getBlockReport(int namespaceId) throws IOException {
    ArrayList<Block> ret = new ArrayList<Block>();
    org.apache.hadoop.hdfs.server.datanode.NamespaceMap nm = volumeMap
        .getNamespaceMap(namespaceId);
    if (nm == null) {
      return new Block[0];
    }
    int n = nm.getNumBucket();
    for (int i = 0; i < n; i++) {
      BlockBucket bb = nm.getBucket(i);
      bb.getBlockReport(ret);
    }
    return ret.toArray(new Block[ret.size()]);
  }

  /**
   * Check whether the given block is a valid one.
   */
  public boolean isValidBlock(int namespaceId, Block b, boolean checkSize)
      throws IOException {
    File f = null;
    try {
      f = getValidateBlockFile(namespaceId, b, checkSize);
    } catch (IOException e) {
      DataNode.LOG.warn("Block " + b + " is not valid:", e);
    }

    return ((f != null) ? isBlockFinalizedWithLock(namespaceId, b) : false);
  }
 
  public boolean isValidVolume(File currentDir) throws IOException {
    return volumes.isValidDir(currentDir);
  }
 
  /**
   * Find the file corresponding to the block and return it if it exists.
   */
  File validateBlockFile(int namespaceId, Block b) throws IOException {
    return getValidateBlockFile(namespaceId, b, false);
  }

  /**
   * Find the file corresponding to the block and return it if it exists.
   */
  File getValidateBlockFile(int namespaceId, Block b, boolean checkSize)
      throws IOException {
    //Should we check for metadata file too?
    DatanodeBlockInfo blockInfo = this.getDatanodeBlockInfo(namespaceId, b);
    File f = null;
    if (blockInfo != null) {
      if (checkSize) {
        blockInfo.verifyFinalizedSize();
      }
      f = blockInfo.getBlockDataFile().getFile();
      assert f != null;

      if(f.exists()) {
        return f;
      }
  
      // if file is not null, but doesn't exist - possibly disk failed
      datanode.checkDiskError();
    }
   
    if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
      InterDatanodeProtocol.LOG.debug("b=" + b + ", f=" + ((f == null) ? "null"
          : f));
    }
    return null;
  }

  /** {@inheritDoc} */
  public void validateBlockMetadata(int namespaceId, Block b) throws IOException {
    DatanodeBlockInfo info;
    lock.readLock().lock();
    try {
      info = volumeMap.get(namespaceId, b);
    } finally {
      lock.readLock().unlock();
    }
    if (info == null) {
      throw new IOException("Block " + b + " does not exist in volumeMap.");
    }

    File f = info.getDataFileToRead();
    // Try to find out block size
    long localBlockSize;
    if (f == null) {
      f = info.getBlockDataFile().getTmpFile(namespaceId, b);
      if (f == null) {
        throw new IOException("Block " + b + " does not exist on disk.");
      }
      if (!f.exists()) {
        throw new IOException("Block " + b +
                              " block file " + f +
                              " does not exist on disk.");
      }
      if (info.isInlineChecksum()) {
        // TODO: do we want to do it?
        localBlockSize = BlockInlineChecksumReader.getBlockSizeFromFileLength(
            f.length(), info.getChecksumType(), info.getBytesPerChecksum());
      } else {
        localBlockSize = f.length();
      }
    } else {
      if (info.isFinalized()) {
        info.verifyFinalizedSize();
        localBlockSize = info.getFinalizedSize();
      } else {
        if (info.isInlineChecksum()) {
          // TODO: do we want to do it?
          localBlockSize = BlockInlineChecksumReader
              .getBlockSizeFromFileLength(f.length(), info.getChecksumType(),
                  info.getBytesPerChecksum());
        } else {
          localBlockSize = f.length();
        }
      }
    }

    if (b.getNumBytes() > localBlockSize) {
      throw new IOException("Block " + b +
                            " length is " + b.getNumBytes()  +
                            " does not match block file length " +
                            f.length());
    }
    long stamp;
    DataChecksum dcs;
    if (!info.isInlineChecksum()) {
      File meta = BlockWithChecksumFileWriter.getMetaFile(f, b);
      if (meta == null) {
        throw new IOException("Block " + b +
                              " metafile does not exist.");
      }
      if (!meta.exists()) {
        throw new IOException("Block " + b +
                              " metafile " + meta +
                              " does not exist on disk.");
      }
      long metaFileSize = meta.length();
      if (metaFileSize == 0 && localBlockSize > 0) {
        throw new IOException("Block " + b + " metafile " + meta + " is empty.");
      }
      stamp = BlockWithChecksumFileReader.parseGenerationStampInMetaFile(f,
          meta);
      if (metaFileSize == 0) {
        // no need to check metadata size for 0 size file
        return;
      }
      dcs = BlockMetadataHeader.readHeader(meta).getChecksum();
      // verify that checksum file has an integral number of checksum values.
      int checksumsize = dcs.getChecksumSize();
      long actual = metaFileSize - BlockMetadataHeader.getHeaderSize();
      long numChunksInMeta = actual/checksumsize;
      if (actual % checksumsize != 0) {
        throw new IOException("Block " + b +
                              " has a checksum file of size " + metaFileSize +
                              " but it does not align with checksum size of " +
                              checksumsize);
      }
      int bpc = dcs.getBytesPerChecksum();
      long minDataSize = (numChunksInMeta - 1) * bpc;
      long maxDataSize = numChunksInMeta * bpc;
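      // Worked example (illustrative values): with CRC32 checksums
      // (checksumsize = 4) and bpc = 512, a meta file holding 10 checksum
      // values covers block lengths in (9 * 512, 10 * 512] = (4608, 5120];
      // anything outside that range is rejected below.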
      if (localBlockSize > maxDataSize || localBlockSize <= minDataSize) {
        throw new IOException("Block " + b +
                              " is of size " + f.length() +
                              " but has " + (numChunksInMeta + 1) +
                              " checksums and each checksum size is " +
                              checksumsize + " bytes.");
      }
    } else {
      stamp = BlockInlineChecksumReader
          .getGenerationStampFromInlineChecksumFile(f.getName());
      if (localBlockSize == 0) {
        // no need to check metadata size for 0 size file
        return;
      }
      // TODO: What verification can we do here?
    }
    if (stamp != b.getGenerationStamp()) {
      throw new IOException("Block " + b +
                            " genstamp is " + b.getGenerationStamp()  +
                            " does not match meta file stamp " +
                            stamp);
    }
    // We could crc-check the entire block here, but it will be a costly
    // operation. Instead we rely on the above check (file length mismatch)
    // to detect corrupt blocks.
  }

  /**
   * We're informed that a block is no longer valid.  We
   * could lazily garbage-collect the block, but why bother?
   * just get rid of it.
   */
  public void invalidate(int namespaceId, Block invalidBlks[]) throws IOException {
    boolean error = false;
    for (int i = 0; i < invalidBlks.length; i++) {
      File f = null;
      FSVolume v;
      boolean inlineChecksum;
      DatanodeBlockInfo dinfo = null;
      lock.writeLock().lock();
      try {
        dinfo = volumeMap.get(namespaceId, invalidBlks[i]);
        if (dinfo == null) {
          // It is possible that after block reports, Datanodes receive
          // duplicate invalidate requests from the name-node. We just skip
          // the block. At the end of the function we don't throw an
          // exception for this case, since there is no need for a disk check.
          //
          DataNode.LOG.info("Unexpected error trying to delete block "
                           + invalidBlks[i] +
                           ". BlockInfo not found in volumeMap.");
          continue;
        }
        inlineChecksum = dinfo.isInlineChecksum();
        f = dinfo.getDataFileToRead();
        v = dinfo.getBlockDataFile().getVolume();
        if (f == null) {
          DataNode.LOG.warn("Unexpected error trying to delete block "
                            + invalidBlks[i] +
                            ". Block not found in blockMap." +
                            ((v == null) ? " " : " Block found in volumeMap."));
          error = true;
          continue;
        }
        if (v == null) {
          DataNode.LOG.warn("Unexpected error trying to delete block "
                            + invalidBlks[i] +
                            ". No volume for this block." +
                            " Block found in blockMap. " + f + ".");
          error = true;
          continue;
        }
        File parent = f.getParentFile();
        if (parent == null) {
          DataNode.LOG.warn("Unexpected error trying to delete block "
                            + invalidBlks[i] +
                            ". Parent not found for file " + f + ".");
          error = true;
          continue;
        }
        //TODO ???
        v.clearPath(namespaceId, parent);
        volumeMap.remove(namespaceId, invalidBlks[i]);
      } finally {
        lock.writeLock().unlock();
      }
     
      // close the File Channel
      dinfo.getBlockDataFile().closeFileChannel();

      //rename the files to be deleted
      //for safety we add prefix instead of suffix,
      //so the valid block files still start with "blk_"
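      // e.g. a data file named "blk_<id>" becomes "<DELETE_FILE_EXT>blk_<id>"
      // in the same directory, so scans that look for the "blk_" prefix no
      // longer treat the replica as valid while the asynchronous delete is
      // still pending.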
      File blockFileRenamed = new File(f.getParent()
          + File.separator + DELETE_FILE_EXT + f.getName());
     
      File metaFile = null;
      File metaFileRenamed = null;

      if (!inlineChecksum) {
        metaFile = BlockWithChecksumFileWriter.getMetaFile(f, invalidBlks[i]);
        metaFileRenamed = new File(metaFile.getParent()
            + File.separator + DELETE_FILE_EXT + metaFile.getName());
      }

      if((!f.renameTo(blockFileRenamed)) ||
          (!inlineChecksum && !metaFile.renameTo(metaFileRenamed))) {
        DataNode.LOG.warn("Unexpected error trying to delete block "
            + invalidBlks[i] +
            ". Cannot rename files for deletion.");
        error = true;
        continue;
      }

      if(invalidBlks[i].getNumBytes() != BlockFlags.NO_ACK){
        datanode.notifyNamenodeDeletedBlock(namespaceId, invalidBlks[i]);
      }
      // Delete the block asynchronously to make sure we can do it fast enough
      asyncDiskService.deleteAsync(v, blockFileRenamed, metaFileRenamed,
          invalidBlks[i].toString(), namespaceId);
    }
    if (error) {
      throw new IOException("Error in deleting blocks.");
    }
  }

  /**
   * Turn the block identifier into a filename.
   */
  public File getFile(int namespaceId, Block b) {
    lock.readLock().lock();
    try {
      DatanodeBlockInfo info = volumeMap.get(namespaceId, b);
      if (info != null) {
        return info.getDataFileToRead();
      }
      return null;
    } finally {
      lock.readLock().unlock();
    }
  }
 
  @Override
  public DatanodeBlockInfo getDatanodeBlockInfo(int namespaceId, Block b) {
    return volumeMap.get(namespaceId, b);
  }

  @Override
  public ReplicaToRead getReplicaToRead(int namespaceId, Block block) {
    lock.readLock().lock();
    try {
      ActiveFile activefile = volumeMap.getOngoingCreates(namespaceId, block);
      if (activefile != null) {
        return activefile;
      }
      DatanodeBlockInfo info = volumeMap.get(namespaceId, block);
      if (info == null) {
        if (DataNode.LOG.isDebugEnabled()) {
          DataNode.LOG.debug("volumeMap=" + volumeMap);
        }
      }
      return info;
    } finally {
      lock.readLock().unlock();
    }
  }

  /**
   * Check whether the data directories are healthy. If some volumes have
   * failed, make sure to remove all the blocks that belong to those volumes.
   * @throws DiskErrorException
   */
  public void checkDataDir() throws DiskErrorException {
    long total_blocks=0, removed_blocks=0;
    List<FSVolume> failed_vols = null;

    failed_vols = volumes.checkDirs();

    // if there are no failed volumes, return
    if(failed_vols == null)
      return;
   
    // else
    // remove related blocks
    long mlsec = System.currentTimeMillis();
    lock.writeLock().lock();
    try {
      volumeMap.removeUnhealthyVolumes(failed_vols);
    } finally {
      lock.writeLock().unlock();
    }
    mlsec = System.currentTimeMillis() - mlsec;
    DataNode.LOG.warn(">>>>>>>>>>>>Removed " + removed_blocks + " out of " + total_blocks +
        "(took " + mlsec + " millisecs)");

    // report the error
    StringBuilder sb = new StringBuilder();
    for(FSVolume fv : failed_vols) {
      sb.append(fv.toString() + ";");
    }

    throw  new DiskErrorException("DataNode failed volumes:" + sb);
 
  }
  /**
   * Remove the given directories from the list of volumes in use.
   * This function also makes sure to remove all the blocks that belong to
   * these volumes.
   */
  public void removeVolumes(Configuration conf, List<File> directories)
      throws Exception {
    if (directories == null || directories.isEmpty()) {
      DataNode.LOG.warn("There were no directories to remove. Exiting ");
      return;
    }
    List<FSVolume> volArray = null;
    lock.readLock().lock();
    try {
      volArray = volumes.removeBVolumes(directories);
    } finally {
      lock.readLock().unlock();
    }
    // remove related blocks
    long mlsec = System.currentTimeMillis();
    lock.writeLock().lock();
    try {
      volumeMap.removeUnhealthyVolumes(volArray);
    } finally {
      lock.writeLock().unlock();
    }
    mlsec = System.currentTimeMillis() - mlsec;
    DataNode.LOG.warn(">>>>>>>>>Removing these blocks took " + mlsec +
             " millisecs in refresh<<<<<<<<<<<<<<< ");
    StringBuilder sb = new StringBuilder();
    for(FSVolume fv : volArray) {
      sb.append(fv.toString() + ";");
    }
    throw new DiskErrorException("These volumes were removed: " + sb);
 
  public void addVolumes(Configuration conf, int namespaceId, String nsDir,
      Collection<StorageDirectory> dirs) throws Exception {
    if (dirs == null || dirs.isEmpty()) {
      return;
    }
    FSVolume[] volArray = new FSVolume[dirs.size()];
    File[] dirArray = new File[dirs.size()];
    int idx = 0;
    for (Iterator<StorageDirectory> iter = dirs.iterator() ; iter.hasNext(); idx++) {
      dirArray[idx] = iter.next().getCurrentDir();
      volArray[idx] = new FSVolume(this, dirArray[idx], conf);
    }

    lock.writeLock().lock();
    try {
      volumes.addVolumes(volArray);
      for (FSVolume vol : volArray) {
        vol.addNamespace(namespaceId, nsDir, conf, datanode.isSupportAppends());
      }
    } finally {
      lock.writeLock().unlock();
    }

    asyncDiskService.insertDisk(dirArray, conf);
  }


  public String toString() {
    return "FSDataset{dirpath='"+volumes+"'}";
  }

  ObjectName mbeanName;
  ObjectName versionBeanName;
  Random rand = new Random();
 
  /**
   * Register the FSDataset MBean using the name
   *        "hadoop:service=DataNode,name=FSDatasetState-<storageid>"
   */
  void registerMBean(final String storageId) {
    // We wrap to bypass the standard MBean naming convention.
    // This wrapping can be removed in Java 6, which is more flexible in
    // package naming for MBeans and their implementations.
    StandardMBean bean;
    String storageName;
    if (storageId == null || storageId.equals("")) {// Temp fix for the uninitialized storage
      storageName = "UndefinedStorageId" + rand.nextInt();
    } else {
      storageName = storageId;
    }
    try {
      bean = new StandardMBean(this,FSDatasetMBean.class);
      mbeanName = MBeanUtil.registerMBean("DataNode", "FSDatasetState-" + storageName, bean);
      versionBeanName = VersionInfo.registerJMX("DataNode");
    } catch (NotCompliantMBeanException e) {
      e.printStackTrace();
    }

    DataNode.LOG.info("Registered FSDatasetStatusMBean");
  }

  public void shutdown() {
    if (blockCrcMapFlusher != null) {
      blockCrcMapFlusher.setClose();
    }
    if (blockCrcMapFlusherThread != null) {
      blockCrcMapFlusherThread.interrupt();
      try {
        this.blockCrcMapFlusherThread.join();
        this.blockCrcMapFlusherThread = null;
      } catch (InterruptedException ie) {
      }
    }
    if (mbeanName != null)
      MBeanUtil.unregisterMBean(mbeanName);
    if (versionBeanName != null) {
      MBeanUtil.unregisterMBean(versionBeanName);
    }
    if (asyncDiskService != null) {
      asyncDiskService.shutdown();
    }

    if(volumes != null) {
      lock.writeLock().lock();
      try {
        if (volumes.scannersExecutor != null) {
          volumes.scannersExecutor.shutdown();
        }

        for (FSVolume volume : volumes.getVolumes()) {
          if(volume != null) {
            volume.shutdown();
          }
        }
      } finally {
        lock.writeLock().unlock();
      }
    }
  }
 
  public void addNamespace(int namespaceId, String nsDir,
      Configuration conf) throws IOException {
    DataNode.LOG.info("Adding namespace " + namespaceId);
    lock.writeLock().lock();
    try{
      volumeMap.initNamespace(namespaceId);
      volumes.addNamespace(namespaceId, nsDir, conf);
    } finally {
      lock.writeLock().unlock();
    }
   
    // Load block CRC files
    int numBuckets = volumeMap.getNumBuckets(namespaceId);
    for (FSVolume volume : volumes.getVolumes()) {
      try {
        File blockCrcFile = volume.getBlockCrcFile(namespaceId);
         
        if (blockCrcFile == null || !blockCrcFile.exists()) {
          continue;
        }
        int numUpdated = 0;
        FileInputStream fis = new FileInputStream(blockCrcFile);
        try {
          BlockCrcFileReader reader = new BlockCrcFileReader(
              new DataInputStream(fis));
          reader.readHeader();
          if (reader.getNumBuckets() != numBuckets) {
            // TODO: support it if needed. Now it's not clear whether we will
            // ever need it.
            DataNode.LOG
                .warn("Do not yet support loading block CRCs if bucket size changes: bucket size on disk: "
                    + reader.getNumBuckets());
          } else {
            numUpdated += volumeMap.updateBlockCrc(namespaceId, reader);
          }
        } finally {
          fis.close();
        }
        DataNode.LOG.info("Finish loading Block CRC file for namespace "
            + namespaceId + " volume " + volume + " " + numUpdated
            + " blocks' CRC updated.");
      } catch (IOException ioe) {
        DataNode.LOG.warn("IOException when try to load block CRC fle from volume"
            + volume.getDir(), ioe);
      } finally {
        volume.setNamespaceBlockCrcLoaded(namespaceId, true);
      }
    }
  }
 
  public void removeNamespace(int namespaceId){
    DataNode.LOG.info("Removing namespace " + namespaceId);
    lock.writeLock().lock();
    try{
      if (volumeMap != null) {
        volumeMap.removeNamespace(namespaceId);
      }
      if (volumes != null) {
        volumes.removeNamespace(namespaceId);
      }
    } finally {
      lock.writeLock().unlock();
    }
  }

  public String getStorageInfo() {
    return toString();
  }

  @Override
  public BlockRecoveryInfo startBlockRecovery(int namespaceId, long blockId)
      throws IOException {
    Block stored = getStoredBlock(namespaceId, blockId, true);

    if (stored == null) {
      return null;
    }

    // It's important that this loop not be synchronized - otherwise
    // this will deadlock against the thread it's joining against!
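    // getActiveThreads() takes the dataset write lock internally and returns a
    // snapshot of the writer threads; joining them here without holding the
    // lock lets those threads finish work that may itself need the lock.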
    while (true) {
      DataNode.LOG.debug(
          "Interrupting active writer threads for block " + stored);
      List<Thread> activeThreads = getActiveThreads(namespaceId, stored);
      if (activeThreads == null) break;
      if (interruptAndJoinThreads(activeThreads))
        break;
    }

    lock.readLock().lock();
    try {
      // now that writers are stopped, re-fetch the block's meta info
      stored = getStoredBlock(namespaceId, blockId, true);

      if (stored == null) {
        return null;
      }

      ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, stored);
      boolean isRecovery = (activeFile != null) && activeFile.wasRecoveredOnStartup;


      BlockRecoveryInfo info = new BlockRecoveryInfo(stored, isRecovery);
      if (DataNode.LOG.isDebugEnabled()) {
        DataNode.LOG.debug("getBlockMetaDataInfo successful block=" + stored +
                  " length " + stored.getNumBytes() +
                  " genstamp " + stored.getGenerationStamp());
      }

      // paranoia! verify that the contents of the stored block
      // matches the block file on disk.
      validateBlockMetadata(namespaceId, stored);
      return info;
    } finally {
      lock.readLock().unlock();
    }
  }

  /**
   * Copies a file as fast as possible. Tries to do a hardlink instead of a copy
   * if the hardlink parameter is specified.
   *
   * @param src
   *          the source file for copying
   * @param dst
   *          the destination file for copying
   * @param hardlink
   *          whether or not to attempt a hardlink
   * @throws IOException
   */
  public void copyFile(File src, File dst, boolean hardlink) throws IOException {

    if (src == null || dst == null) {
      throw new IOException("src/dst file is null");
    }

    try {
      if (hardlink && shouldHardLinkBlockCopy) {
        // Remove destination before hard linking, since this file might already
        // exist and a hardlink would fail as a result.
        if (dst.exists()) {
          if(!dst.delete()) {
            throw new IOException("Deletion of file : " + dst + " failed");
          }
        }
        NativeIO.link(src, dst);
        DataNode.LOG.info("Hard Link Created from : " + src + " to " + dst);
        return;
      }
    } catch (IOException e) {
      DataNode.LOG.warn("Hard link failed from : " + src + " to " + dst
          + " continuing with regular file copy");
    }

    FileChannel input = null;
    FileChannel output = null;
    try {
      // This improves copying performance a lot since it uses native buffers
      // for copying.
      input = new FileInputStream(src).getChannel();
      output = new FileOutputStream(dst).getChannel();
      if (input == null || output == null)  {
        throw new IOException("Could not create file channels for src : " + src
            + " dst : " + dst);
      }
      long bytesLeft = input.size();
      long position = 0;
      while (bytesLeft > 0) {
        long bytesWritten = output.transferFrom(input, position, bytesLeft);
        bytesLeft -= bytesWritten;
        position += bytesWritten;
      }
      if (datanode.syncOnClose) {
        output.force(true);
      }
    } finally {
      if (input != null) {
        input.close();
      }
      if (output != null) {
        output.close();
      }
    }
  }

  /**
   * Find a volume on the datanode for the destination block to be placed on.
   * It tries to place the destination block on the same volume as the source
   * block since hardlinks can be performed only between two files on the same
   * disk
   *
   * @param srcFileSystem
   *          the file system for srcBlockFile
   * @param srcNamespaceId
   *          the namespace id for srcBlock
   * @param srcBlock
   *          the source block which needs to be hardlinked
   * @param srcBlockFile
   *          the block file for srcBlock
   * @return the FSVolume on which we should put the dstBlock, null if we can't
   *         find such a volume.
   * @throws IOException
   */
  private FSVolume findVolumeForHardLink(String srcFileSystem,
      int srcNamespaceId, Block srcBlock, File srcBlockFile)
    throws IOException {
    FSVolume dstVol = null;
    if (srcBlockFile == null || !srcBlockFile.exists()) {
      throw new IOException("File " + srcBlockFile
          + " is not valid or does not have"
          + " a valid block file");
    }

    // The source file might not necessarily be a part of the FSVolumeSet of
    // this datanode, it could be part of a FSVolumeSet of another datanode on
    // the same host.
    DatanodeBlockInfo blockInfo = volumeMap.get(srcNamespaceId, srcBlock);
    if (blockInfo != null) {
      dstVol = blockInfo.getBlockDataFile().getVolume();
    } else {
      for(FSVolume volume : volumes.getVolumes()) {
        String volFileSystem = volume.getFileSystem();
        if (volFileSystem.equals(srcFileSystem)) {
          dstVol = volume;
          break;
        }
      }
    }
    return dstVol;
  }

  /**
   * Finds a volume for the dstBlock and adds the new block to the FSDataset
   * data structures to indicate we are going to start writing to the block.
   *
   * @param srcFileSystem
   *          the file system for srcBlockFile
   * @param srcBlockFile
   *          the block file for the srcBlock
   * @param srcNamespaceId
   *          the namespace id for source block
   * @param srcBlock
   *          the source block that needs to be copied over
   * @param dstNamespaceId
   *          the namespace id for destination block
   * @param dstBlock
   *          the new destination block that needs to be created for copying
   * @return returns whether or not a hardlink is possible, if hardlink was not
   *         requested this is always false.
   * @throws IOException
   */
  private boolean copyBlockLocalAdd(String srcFileSystem, File srcBlockFile,
      int srcNamespaceId, Block srcBlock, int dstNamespaceId, Block dstBlock)
      throws IOException {
    boolean hardlink = true;
    File dstBlockFile = null;
    lock.writeLock().lock();
    try {
      if (isValidBlock(dstNamespaceId, dstBlock, false) ||
          volumeMap.getOngoingCreates(dstNamespaceId, dstBlock) != null) {
        throw new BlockAlreadyExistsException("Block " + dstBlock
            + " already exists");
      }

      if (srcBlockFile == null || !srcBlockFile.exists()) {
        throw new IOException("Block " + srcBlock.getBlockName()
            + " is not valid or does not have a valid block file");
      }
      boolean inlineChecksum = Block.isInlineChecksumBlockFilename(srcBlockFile
          .getName());

      FSVolume dstVol = null;
      if (shouldHardLinkBlockCopy) {
        dstVol = findVolumeForHardLink(
            srcFileSystem, srcNamespaceId, srcBlock, srcBlockFile);
      }

      // Could not find a volume for a hard link, fall back to regular file
      // copy.
      if (dstVol == null) {
        dstVol = volumes.getNextVolume(srcBlock.getNumBytes());
        hardlink = false;
      }

      int checksumType = DataChecksum.CHECKSUM_UNKNOWN;
      int bytesPerChecksum = -1;
      if (inlineChecksum) {
        GenStampAndChecksum sac = BlockInlineChecksumReader
            .getGenStampAndChecksumFromInlineChecksumFile(srcBlockFile
                .getName());
        checksumType = sac.checksumType;
        bytesPerChecksum = sac.bytesPerChecksum;
      }

      List<Thread> threads = null;
      // We do not want to create a BBW, hence treat this as a replication
      // request.
      dstBlockFile = createTmpFile(dstNamespaceId, dstVol, dstBlock, true,
          inlineChecksum, checksumType, bytesPerChecksum);
      DatanodeBlockInfo binfo = new DatanodeBlockInfo(dstVol, dstBlockFile,
          DatanodeBlockInfo.UNFINALIZED, true, inlineChecksum, checksumType,
          bytesPerChecksum, false, 0);
      volumeMap.add(dstNamespaceId, dstBlock, binfo);
      volumeMap.addOngoingCreates(dstNamespaceId, dstBlock, new ActiveFile(
          binfo, threads, ActiveFile.UNKNOWN_SIZE, false));     
    } finally {
      lock.writeLock().unlock();
    }

    if (dstBlockFile == null) {
      throw new IOException("Could not allocate block file for : " +
          dstBlock.getBlockName());
    }
    return hardlink;
  }

  /**
   * Finalize the block in FSDataset.
   *
   * @param dstNamespaceId
   *          the namespace id for dstBlock
   * @param dstBlock
   *          the block that needs to be finalized
   * @param dstBlockFile
   *          the block file for the block that has to be finalized
   * @throws IOException
   */
  private void copyBlockLocalFinalize(int dstNamespaceId,
      Block dstBlock, File dstBlockFile)
    throws IOException {
    boolean inlineChecksum = Block.isInlineChecksumBlockFilename(dstBlockFile
        .getName());
    long blkSize = 0;
    long fileSize = dstBlockFile.length();
    lock.writeLock().lock();
    try {
      DatanodeBlockInfo info = volumeMap.get(dstNamespaceId, dstBlock);
      if (info == null) {
        throw new IOException("Could not find information for " + dstBlock);
      }
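      // For inline-checksum block files, checksum bytes live in the same file as
      // the data, so the logical block size has to be derived from the raw file
      // length. Illustrative arithmetic only (a hypothetical layout with a 4-byte
      // CRC after every bytesPerChecksum bytes of data and no other overhead):
      //   fileSize ~= blkSize + ceil(blkSize / bytesPerChecksum) * 4
      // getBlockSizeFromFileLength() performs the inverse mapping using the
      // checksum type and chunk size recorded for the block.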
      if (inlineChecksum) {
        blkSize = BlockInlineChecksumReader.getBlockSizeFromFileLength(fileSize,
            info.getChecksumType(), info.getBytesPerChecksum());
      } else {
        blkSize = fileSize;
      }

      FSVolume dstVol = info.getBlockDataFile().getVolume();
      // Finalize block on disk.
      File dest = dstVol.addBlock(dstNamespaceId, dstBlock, dstBlockFile,
          info.isInlineChecksum(), info.getChecksumType(),
          info.getBytesPerChecksum());
      volumeMap.add(dstNamespaceId, dstBlock,
          new DatanodeBlockInfo(dstVol, dest, blkSize, true, inlineChecksum,
              info.getChecksumType(), info.getBytesPerChecksum(), false, 0));
      volumeMap.removeOngoingCreates(dstNamespaceId, dstBlock);
    } finally {
      lock.writeLock().unlock();
    }
  }

  /** {@inheritDoc} */
  @Override
  public void copyBlockLocal(String srcFileSystem, File srcBlockFile,
      int srcNamespaceId, Block srcBlock, int dstNamespaceId, Block dstBlock)
      throws IOException {
    File dstBlockFile = null;
    try {
      boolean hardlink = copyBlockLocalAdd(srcFileSystem, srcBlockFile,
          srcNamespaceId, srcBlock, dstNamespaceId, dstBlock);

      DatanodeBlockInfo binfo = volumeMap.get(dstNamespaceId, dstBlock);
      dstBlockFile = binfo.getDataFileToRead();

      // Copy files.
      copyFile(srcBlockFile, dstBlockFile, hardlink);

      // Copy metafile.
      if (!binfo.isInlineChecksum()) {
        File metaFileSrc = BlockWithChecksumFileWriter.getMetaFile(srcBlockFile, srcBlock);
        File metaFileDst = BlockWithChecksumFileWriter.getMetaFile(dstBlockFile, dstBlock);
        copyFile(metaFileSrc, metaFileDst, hardlink);
      }

      // Finalize block
      copyBlockLocalFinalize(dstNamespaceId, dstBlock, dstBlockFile);
    } catch (BlockAlreadyExistsException be) {
      throw be;
    } catch (IOException e) {
      unfinalizeBlock(dstNamespaceId, dstBlock);
      throw e;
    }
  }
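   // A minimal usage sketch for copyBlockLocal (hypothetical ids and variables;
   // the caller is assumed to have resolved srcBlockFile beforehand):
   //
   //   String srcFs = dataset.getFileSystemForBlock(srcNsId, srcBlock);
   //   dataset.copyBlockLocal(srcFs, srcBlockFile, srcNsId, srcBlock,
   //       dstNsId, dstBlock);
   //
   // On any IOException after the destination block has been registered, the
   // partially copied block is rolled back via unfinalizeBlock above; a
   // BlockAlreadyExistsException is rethrown unchanged.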

  /** {@inheritDoc} */
  @Override
  public String getFileSystemForBlock(int namespaceId, Block block) throws IOException {
    if (!isValidBlock(namespaceId, block, false)) {
      throw new IOException("Invalid block");
    }
    return volumeMap.get(namespaceId, block).getBlockDataFile().getVolume()
        .getFileSystem();
  }

  static File createTmpFile(Block b, File f) throws IOException {
    if (f.exists()) {
      throw new IOException("Unexpected problem in creating temporary file for "+
                            b + ".  File " + f + " should not be present, but is.");
    }
    // Create the zero-length temp file
    //
    boolean fileCreated = false;
    try {
      fileCreated = f.createNewFile();
    } catch (IOException ioe) {
      throw (IOException) new IOException(DISK_ERROR + f).initCause(ioe);
    }
    if (!fileCreated) {
      throw new IOException("Unexpected problem in creating temporary file for "+
                            b + ".  File " + f + " should be creatable, but is already present.");
    }
    return f;
  }

  @Override
  public long size(int namespaceId) {
    try {
      return volumeMap.size(namespaceId);
    } catch (Exception e) {
      return -1;
    }
  } 
 
   /**
    * Reconcile the difference between blocks on the disk and blocks in
    * volumeMap.
    *
    * Check the given block for inconsistencies. Look at the
    * current state of the block and reconcile the differences as follows:
    * <ul>
    * <li>If the block file is missing, delete the block from volumeMap</li>
    * <li>If the block file exists and the block is missing in volumeMap,
    * add the block to volumeMap</li>
    * <li>If the generation stamp does not match, update the block with the right
    * generation stamp</li>
    * <li>If the block length in memory does not match the actual block file length,
    * mark the block as corrupt and update the block length in memory</li>
    * <li>If the file recorded in {@link DatanodeBlockInfo} does not match the file on
    * the disk, update {@link DatanodeBlockInfo} with the correct file</li>
    * </ul>
    * A summary of the state-to-action mapping is sketched in the comment after
    * this method.
    *
    * @param nsid namespace id of the block that differs
    * @param delta dataset delta used to detect concurrent changes to the block
    * @param info scan difference describing the on-disk vs in-memory state
    */
  public void checkAndUpdate(Integer nsid, FSDatasetDelta delta,
      ScanDifference info) throws IOException {

    long blockId = info.getBlockId();

    lock.writeLock().lock();
    try {
      // we don't want delta to record changes we do during reconciliation
      delta.stopRecordingDelta();

      if (delta.get(nsid, blockId) != null) {
        // FIXME Presence of the block in delta means that it was changed somehow
        // during the interval between the difference computation in the directory
        // scanner and the acquisition of the writeLock in this method. We could
        // enumerate the operations that may have happened to the block and write
        // logic for each of them, but that adds a lot of complexity. Instead we
        // simply skip reconciliation for the block this time; if it still has
        // problems, they are likely to be resolved on the next scan.
        return;
      }
      Block memBlock = new Block(blockId, 0, GenerationStamp.WILDCARD_STAMP);
      DatanodeBlockInfo memBlockInfo = volumeMap.get(nsid, memBlock);
      if (memBlockInfo != null && !memBlockInfo.isFinalized()) {
        // Block is not finalized - ignore the difference
        return;
      }

      // We don't have any files for this block on disk
      if (info.getState() == ScanDifference.DISK_FILES_MISSING) {
        if (memBlockInfo == null) {
          return;
        }
        volumeMap.remove(nsid, memBlock);
        LOG.info("checkAndUpdate: removing block: " + memBlock
            + " for namespace: " + nsid);
        if (datanode.blockScanner != null) {
          datanode.blockScanner.deleteBlock(nsid, memBlock);
        }
        return;
      }

      // We don't have the block in memory, but have some of its files on disk
      if (info.getState() == ScanDifference.MEMORY_BLOCK_MISSING) {
        // if there's a block file, then add it to volumeMap, otherwise
        // remove metaFile if any
        if (info.getBlockFile() != null) {
          Block newBlock = new Block(blockId, info.getLength(),
              info.getGenStamp());
          boolean isInlineChecksum = info.isInlineChecksum();
          DatanodeBlockInfo diskBlockInfo = null;
          if (isInlineChecksum) {
            GenStampAndChecksum sac = BlockInlineChecksumReader
                .getGenStampAndChecksumFromInlineChecksumFile(info
                    .getBlockFile().getName());
            diskBlockInfo = new DatanodeBlockInfo(info.getVolume(),
                info.getBlockFile(), info.getLength(), true, true,
                sac.checksumType, sac.bytesPerChecksum, false, 0);
          } else {
            diskBlockInfo = new DatanodeBlockInfo(info.getVolume(),
                info.getBlockFile(), info.getLength(), true, false,
                DataChecksum.CHECKSUM_UNKNOWN, -1, false, 0);
          }
          volumeMap.add(nsid, newBlock, diskBlockInfo);
          LOG.info("checkAndUpdate: adding block: " + newBlock
              + " for namespace: " + nsid + " size: "
              + diskBlockInfo.getBytesVisible());
          if (datanode.blockScanner != null) {
            datanode.blockScanner.addBlock(nsid, newBlock);
          }
        } else {
          // scheduling a file for deletion
          asyncDiskService
              .deleteAsyncFile(info.getVolume(), info.getMetaFile());
        }
        return;
      }

      // We have this block in memory and some of its files on disk
      if (info.getState() == ScanDifference.OUT_OF_SYNC) {
        if (info.getBlockFile() == null) {
          volumeMap.remove(nsid, memBlock);
          LOG.info("checkAndUpdate: removing block: " + memBlock
              + " for namespace: " + nsid);
          if (datanode.blockScanner != null) {
            datanode.blockScanner.deleteBlock(nsid, memBlock);
          }
          // scheduling a file for deletion
          asyncDiskService
              .deleteAsyncFile(info.getVolume(), info.getMetaFile());
        } else {
          if (memBlockInfo == null) {
            return;
          }
          memBlockInfo.getBlock().setNumBytes(info.getLength());
          memBlockInfo.getBlock().setGenerationStamp(info.getGenStamp());
          LOG.info("checkAndUpdate: updating block: " + memBlockInfo
              + " for namespace: " + nsid);
        }
        return;
      }
    } finally {
      try {
        delta.startRecordingDelta();
      } finally {
        lock.writeLock().unlock();
      }
    }
  }
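   // Summary of the reconciliation above (a restatement of the code paths in
   // checkAndUpdate, not additional behavior):
   //
   //   DISK_FILES_MISSING   -> remove the block from volumeMap and the block scanner
   //   MEMORY_BLOCK_MISSING -> block file present: add it to volumeMap and the block
   //                           scanner; block file absent: schedule the meta file
   //                           for async deletion
   //   OUT_OF_SYNC          -> block file absent: remove the block from volumeMap and
   //                           the block scanner, delete the meta file; block file
   //                           present: sync the length and generation stamp of the
   //                           in-memory block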
}