package org.apache.hadoop.hdfs;
import junit.framework.TestCase;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.lang.reflect.Method;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSOutputStream;
import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.FSConstants.SafeModeAction;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataStorage;
import org.apache.hadoop.hdfs.server.datanode.FSDataset;
import org.apache.hadoop.hdfs.server.datanode.FSDatasetTestUtil;
import org.apache.hadoop.hdfs.server.datanode.NameSpaceSliceStorage;
import org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.FSEditLog;
import org.apache.hadoop.hdfs.server.namenode.FSImageAdapter;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import static org.apache.hadoop.hdfs.AppendTestUtil.loseLeases;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import static org.mockito.Matchers.anyBoolean;
import static org.mockito.Matchers.anyInt;
import static org.mockito.Matchers.anyObject;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.anyString;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.reset;
import static org.mockito.Mockito.spy;
/**
 * File append tests for HDFS-200 & HDFS-142, specifically focused on
 * using append()/sync() to recover block information.
 */
public class TestFileAppend4 extends TestCase {
static final Log LOG = LogFactory.getLog(TestFileAppend4.class);
static final long BLOCK_SIZE = 1024;
static final long BBW_SIZE = 500; // deliberately not aligned on a checksum chunk boundary
static final Object [] NO_ARGS = new Object []{};
Configuration conf;
MiniDFSCluster cluster;
Path file1;
FSDataOutputStream stm;
boolean simulatedStorage = false;
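// Instance initializer: raise the relevant daemon log levels to ALL so that
// failures in these timing-sensitive tests are easier to diagnose.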
{
((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL);
((Log4JLogger)LeaseManager.LOG).getLogger().setLevel(Level.ALL);
((Log4JLogger)FSNamesystem.LOG).getLogger().setLevel(Level.ALL);
((Log4JLogger)DataNode.LOG).getLogger().setLevel(Level.ALL);
((Log4JLogger)DFSClient.LOG).getLogger().setLevel(Level.ALL);
}
@Override
public void setUp() throws Exception {
this.conf = new Configuration();
if (simulatedStorage) {
conf.setBoolean(SimulatedFSDataset.CONFIG_PROPERTY_SIMULATED, true);
}
conf.setBoolean("dfs.support.append", true);
// lower heartbeat interval for fast recognition of DN death
conf.setInt("heartbeat.recheck.interval", 1000);
conf.setInt("dfs.heartbeat.interval", 1);
conf.setInt("dfs.socket.timeout", 5000);
// handle under-replicated blocks quickly (for replication asserts)
// conf.set("dfs.replication.pending.timeout.sec", Integer.toString(5));
conf.setInt("dfs.replication.pending.timeout.sec", 5);
conf.setInt("dfs.replication.interval", 1);
// handle failures in the DFSClient pipeline quickly
// (for cluster.shutdown(); fs.close() idiom)
conf.setInt("ipc.client.connect.max.retries", 1);
conf.setInt("dfs.client.block.recovery.retries", 1);
// Delay blockReceived calls from DNs to be more similar to a real
// cluster. 10ms is enough so that the client often gets there first.
conf.setInt("dfs.datanode.artificialBlockReceivedDelay", 10);
}
@Override
public void tearDown() throws Exception {
}
private void createFile(FileSystem whichfs, String filename,
int rep, long fileSize) throws Exception {
file1 = new Path(filename);
stm = whichfs.create(file1, true, (int)fileSize+1, (short)rep, BLOCK_SIZE);
LOG.info("Created file " + filename);
LOG.info("Writing " + fileSize + " bytes to " + file1);
AppendTestUtil.write(stm, 0, (int)fileSize);
}
private void assertFileSize(FileSystem whichfs, long expectedSize) throws Exception {
LOG.info("reading length of " + file1.getName() + " on namenode");
long realSize = whichfs.getFileStatus(file1).getLen();
assertTrue("unexpected file size! received=" + realSize
+ " , expected=" + expectedSize,
realSize == expectedSize);
}
private void assertNumCurrentReplicas(short rep) throws Exception {
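// getNumCurrentReplicas() is not exposed on the OutputStream type we hold,
// so look it up reflectively on the wrapped DFSOutputStream instance.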
OutputStream hdfs_out = stm.getWrappedStream();
Method r = hdfs_out.getClass().getMethod("getNumCurrentReplicas",
new Class<?> []{});
r.setAccessible(true);
int actualRepl = ((Integer)r.invoke(hdfs_out, NO_ARGS)).intValue();
assertTrue(file1 + " should be replicated to " + rep + " datanodes, not " +
actualRepl + ".", actualRepl == rep);
}
/**
* Recover the file.
* Try to open the file in append mode. Doing this, we get hold of the
* file that the crashed writer was writing to. Once we have it, close it;
* this allows a subsequent reader to see everything up to the last sync.
* NOTE: This is the same algorithm that HBase uses for file recovery.
* @param fs
* @throws Exception
*/
private void recoverFile(final FileSystem fs) throws Exception {
LOG.info("Recovering File Lease");
AppendTestUtil.recoverFile(cluster, fs, file1);
LOG.info("Past out lease recovery");
}
// Waits for all of the blocks to have expected replication
private void waitForBlockReplication(FileSystem whichfs, String filename,
int expected, long maxWaitSec)
throws IOException {
long start = System.currentTimeMillis();
// wait for all the blocks to be replicated
LOG.info("Checking for block replication for " + filename);
int iters = 0;
while (true) {
boolean replOk = true;
BlockLocation[] bl = whichfs.getFileBlockLocations(
whichfs.getFileStatus(file1), 0, BLOCK_SIZE);
if(bl.length == 0) {
replOk = false;
}
for (BlockLocation b : bl) {
int actual = b.getNames().length;
if ( actual < expected ) {
LOG.info("Not enough replicas for " + b +
" yet. Expecting " + expected + ", got " +
actual + ".");
replOk = false;
break;
}
}
if (replOk) {
return;
}
iters++;
if (maxWaitSec > 0 &&
(System.currentTimeMillis() - start) > (maxWaitSec * 1000)) {
throw new IOException("Timedout while waiting for all blocks to " +
" be replicated for " + filename);
}
try {
Thread.sleep(1000);
} catch (InterruptedException ignored) {}
}
}
private void checkFile(FileSystem whichfs, long fileSize) throws Exception {
LOG.info("validating content from datanodes...");
AppendTestUtil.check(whichfs, file1, fileSize);
}
enum CorruptionType {
CORRUPT_LAST_CHUNK,
TRUNCATE_BLOCK_TO_ZERO,
TRUNCATE_BLOCK_HALF;
}
/**
* Corrupt all of the blocks in the blocksBeingWritten dir
* for the specified datanode number, using the given corruption type.
* For CORRUPT_LAST_CHUNK, the last checksum chunk of each block file
* is overwritten with random data.
*/
private void corruptDataNode(int dnNumber, CorruptionType type) throws Exception {
// get the FS data of the specified datanode
File ns_dir = cluster.getBlockDirectory("data" + Integer.toString(dnNumber*2 + 1))
.getParentFile().getParentFile();
File data_dir = new File(ns_dir, MiniDFSCluster.RBW_DIR_NAME);
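// data_dir now points at this datanode's blocksBeingWritten (RBW)
// directory, which holds replicas of blocks that were still open for
// write -- the "bbw" replicas this test corrupts.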
int corrupted = 0;
for (File block : data_dir.listFiles()) {
// only touch the actual data, not the metadata (with CRC)
if (block.getName().startsWith("blk_") &&
!block.getName().endsWith("meta")) {
if (type == CorruptionType.CORRUPT_LAST_CHUNK) {
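// Overwrite a random-length slice of the last (possibly partial) 512-byte
// checksum chunk with random bytes so the data no longer matches its CRC.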
RandomAccessFile file = new RandomAccessFile(block, "rw");
FileChannel channel = file.getChannel();
Random r = new Random();
long lastBlockSize = channel.size() % 512;
long position = channel.size() - lastBlockSize;
int length = r.nextInt((int)(channel.size() - position + 1));
byte[] buffer = new byte[length];
r.nextBytes(buffer);
channel.write(ByteBuffer.wrap(buffer), position);
System.out.println("Deliberately corrupting file " + block.getName() +
" at offset " + position +
" length " + length);
file.close();
} else if (type == CorruptionType.TRUNCATE_BLOCK_TO_ZERO) {
LOG.info("Truncating block file at " + block);
RandomAccessFile blockFile = new RandomAccessFile(block, "rw");
blockFile.setLength(0);
blockFile.close();
RandomAccessFile metaFile = new RandomAccessFile(
FSDataset.findMetaFile(block), "rw");
metaFile.setLength(0);
metaFile.close();
} else if (type == CorruptionType.TRUNCATE_BLOCK_HALF) {
FSDatasetTestUtil.truncateBlockFile(block, block.length() / 2);
} else {
fail("Unknown corruption type: " + type);
}
++corrupted;
}
}
assertTrue("Should have some data in bbw to corrupt", corrupted > 0);
}
// test [1 bbw, 0 HDFS block]
public void testAppendSyncBbw() throws Exception {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 1, true, null);
FileSystem fs1 = cluster.getFileSystem();
FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
try {
createFile(fs1, "/bbw.test", 1, BBW_SIZE);
stm.sync();
// the synced data is not yet reflected in the NN's file length before
// close(); the block under construction is reported as just 1 byte
assertFileSize(fs1, 1);
loseLeases(fs1);
recoverFile(fs2);
// close() should write recovered bbw to HDFS block
assertFileSize(fs2, BBW_SIZE);
checkFile(fs2, BBW_SIZE);
} finally {
fs2.close();
fs1.close();
cluster.shutdown();
}
LOG.info("STOP");
}
// test [1 bbw, 0 HDFS block] with cluster restart
public void testAppendSyncBbwClusterRestart() throws Exception {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 1, true, null);
FileSystem fs1 = cluster.getFileSystem();
FileSystem fs2 = null;
try {
createFile(fs1, "/bbwRestart.test", 1, BBW_SIZE);
stm.sync();
// the synced data is not yet reflected in the NN's file length before
// close(); the block under construction is reported as just 1 byte
assertFileSize(fs1, 1);
fs1.close(); // same as: loseLeases()
cluster.shutdown();
LOG.info("STOPPED first instance of the cluster");
cluster = new MiniDFSCluster(conf, 1, false, null);
cluster.waitActive();
LOG.info("START second instance.");
fs2 = cluster.getFileSystem();
recoverFile(fs2);
// close() should write recovered bbw to HDFS block
assertFileSize(fs2, BBW_SIZE);
checkFile(fs2, BBW_SIZE);
} finally {
if(fs2 != null) {
fs2.close();
}
fs1.close();
cluster.shutdown();
}
LOG.info("STOP");
}
/**
* Test case that stops a writer after finalizing a block but
* before calling completeFile, and then tries to recover
* the lease.
*/
public void testRecoverFinalizedBlock() throws Throwable {
cluster = new MiniDFSCluster(conf, 3, true, null);
try {
cluster.waitActive();
NameNode preSpyNN = cluster.getNameNode();
NameNode spyNN = spy(preSpyNN);
// Delay completeFile
DelayAnswer delayer = new DelayAnswer();
doAnswer(delayer).when(spyNN).complete(anyString(), anyString(), anyLong(),
(Block)anyObject());
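// From here on, the spied NameNode will block inside complete() (via the
// DelayAnswer) until proceed() is called, freezing the writer between
// block finalization on the DNs and file completion on the NN.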
DFSClient client = new DFSClient(null, spyNN, conf, null);
file1 = new Path("/testRecoverFinalized");
final OutputStream stm = client.create("/testRecoverFinalized", true);
// write 1/2 block
AppendTestUtil.write(stm, 0, 4096);
final AtomicReference<Throwable> err = new AtomicReference<Throwable>();
Thread t = new Thread() {
public void run() {
try {
stm.close();
} catch (Throwable t) {
err.set(t);
}
}};
t.start();
LOG.info("Waiting for close to get to latch...");
delayer.waitForCall();
// At this point, the block is finalized on the DNs, but the file
// has not been completed in the NN.
// Lose the leases
LOG.info("Killing lease checker");
client.leasechecker.interruptAndJoin();
FileSystem fs1 = cluster.getFileSystem();
FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(
fs1.getConf());
LOG.info("Recovering file");
recoverFile(fs2);
LOG.info("Telling close to proceed.");
delayer.proceed();
LOG.info("Waiting for close to finish.");
t.join();
LOG.info("Close finished.");
// We expect that close will get a "last block mismatch" error because
// recovery bumps the block's generation stamp on the namenode, which no
// longer matches the generation stamp known to the old client
Throwable thrownByClose = err.get();
assertNotNull(thrownByClose);
assertTrue(thrownByClose.getMessage().
startsWith("Try close a closed file: last block from " +
"client side doesn't match name-node"));
} finally {
cluster.shutdown();
}
}
/**
* Test for an intermittent failure of commitBlockSynchronization.
* This could happen if the DN crashed between calling updateBlocks
* and commitBlockSynchronization.
*/
public void testDatanodeFailsToCommit() throws Throwable {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 1, true, null);
FileSystem fs1 = cluster.getFileSystem();
FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
try {
createFile(fs1, "/datanodeFailsCommit.test", 1, BBW_SIZE);
stm.sync();
loseLeases(fs1);
// Make the NN fail to commitBlockSynchronization one time
NameNode nn = cluster.getNameNode();
nn.namesystem = spy(nn.namesystem);
doAnswer(new ThrowNTimesAnswer(IOException.class, 1)).
when(nn.namesystem).
commitBlockSynchronization((Block)anyObject(), anyInt(), anyInt(),
anyBoolean(), anyBoolean(),
(DatanodeID[])anyObject());
recoverFile(fs2);
// close() should write recovered bbw to HDFS block
assertFileSize(fs2, BBW_SIZE);
checkFile(fs2, BBW_SIZE);
} finally {
fs2.close();
fs1.close();
cluster.shutdown();
}
LOG.info("STOP");
}
/**
* Test that when a DN starts up with bbws from a file that got
* removed or finalized when it was down, the block gets deleted.
*/
public void testBBWCleanupOnStartup() throws Throwable {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 3, true, null);
FileSystem fs1 = cluster.getFileSystem();
try {
int halfBlock = (int) BLOCK_SIZE / 2;
short rep = 3; // replication
assertTrue(BLOCK_SIZE % 4 == 0);
file1 = new Path("/bbwCleanupOnStartup.dat");
// write 1/2 block & sync
stm = fs1.create(file1, true, (int) BLOCK_SIZE * 2, rep, BLOCK_SIZE);
AppendTestUtil.write(stm, 0, halfBlock);
stm.sync();
String dataDirs = cluster.getDataNodes().get(0).getConf().get("dfs.data.dir");
// close one of the datanodes
MiniDFSCluster.DataNodeProperties dnprops = cluster.stopDataNode(0);
stm.close();
List<File> bbwFilesAfterShutdown = getBBWFiles(dataDirs);
assertEquals(1, bbwFilesAfterShutdown.size());
assertTrue(cluster.restartDataNode(dnprops));
List<File> bbwFilesAfterRestart = null;
// Wait up to 10 heartbeats for the files to get removed - it should
// really happen after just a couple.
for (int i = 0; i < 10; i++) {
LOG.info("Waiting for heartbeat #" + i + " after DN restart");
cluster.waitForDNHeartbeat(0, 10000);
// Check if it has been deleted
bbwFilesAfterRestart = getBBWFiles(dataDirs);
if (bbwFilesAfterRestart.size() == 0) {
break;
}
}
assertEquals(0, bbwFilesAfterRestart.size());
} finally {
fs1.close();
cluster.shutdown();
}
}
/**
* Test for a race in appendFile where the file might get removed in between
* the two synchronized sections.
*/
public void testAppendFileRace() throws Throwable {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 1, true, null);
final FileSystem fs1 = cluster.getFileSystem();
try {
createFile(fs1, "/testAppendFileRace", 1, BBW_SIZE);
stm.close();
NameNode nn = cluster.getNameNode();
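// Spy on the edit log so the appender thread can be paused inside
// logSync(), opening a window for this thread to delete the file between
// appendFile's two synchronized sections.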
FSEditLog editLogSpy = FSImageAdapter.injectEditLogSpy(nn.getNamesystem());
DelayAnswer delayer = new DelayAnswer();
doAnswer(delayer).when(editLogSpy).logSync();
final AtomicReference<Throwable> err = new AtomicReference<Throwable>();
Thread appender = new Thread() {
public void run() {
try {
stm = fs1.append(file1);
} catch (Throwable t) {
err.set(t);
}
}
};
LOG.info("Triggering append in other thread");
appender.start();
LOG.info("Waiting for logsync");
delayer.waitForCall();
LOG.info("Resetting spy");
reset(editLogSpy);
LOG.info("Deleting file");
fs1.delete(file1, true);
LOG.info("Allowing append to proceed");
delayer.proceed();
LOG.info("Waiting for append to finish");
appender.join();
if (err.get() != null) {
if (err.get().getMessage().contains(
"File does not exist.")) {
LOG.info("Got expected exception", err.get());
} else {
throw err.get();
}
}
LOG.info("Closing stream");
stm.close();
} finally {
fs1.close();
cluster.shutdown();
}
}
private List<File> getBBWFiles(String dfsDataDirs) {
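// Walk each configured data dir, descend into this namespace's slice under
// the current/ directory, and collect block files (ignoring .meta files)
// from the blocksBeingWritten (RBW) directory.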
ArrayList<File> files = new ArrayList<File>();
for (String dirString : dfsDataDirs.split(",")) {
int nsId = cluster.getNameNode(0).getNamespaceID();
File curDataDir = new File(dirString, MiniDFSCluster.CURRENT_DIR_NAME);
File dir = NameSpaceSliceStorage.getNsRoot(nsId, curDataDir);
File finalizedDir = new File(dir, MiniDFSCluster.FINALIZED_DIR_NAME);
assertTrue("data dir " + finalizedDir + " should exist",
finalizedDir.exists());
File bbwDir = new File(dir, MiniDFSCluster.RBW_DIR_NAME);
assertTrue("bbw dir " + bbwDir + " should eixst",
bbwDir.exists());
for (File blockFile : bbwDir.listFiles()) {
if (!blockFile.getName().endsWith(".meta")) {
files.add(blockFile);
}
}
}
return files;
}
/**
* Test for following sequence:
* 1. Client finishes writing a block, but does not allocate next one
* 2. Client loses lease
* 3. Recovery process starts, but commitBlockSynchronization not called yet
* 4. Client calls addBlock and continues writing
* 5. commitBlockSynchronization proceeds
* 6. Original client tries to write/close
*/
public void testRecoveryOnBlockBoundary() throws Throwable {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 1, true, null);
FileSystem fs1 = cluster.getFileSystem();
final FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
// Allow us to delay commitBlockSynchronization
DelayAnswer delayer = new DelayAnswer();
NameNode nn = cluster.getNameNode();
nn.namesystem = spy(nn.namesystem);
doAnswer(delayer).
when(nn.namesystem).
commitBlockSynchronization((Block) anyObject(), anyInt(), anyInt(),
anyBoolean(), anyBoolean(),
(DatanodeID[]) anyObject());
try {
file1 = new Path("/testWritingDuringRecovery.test");
stm = fs1.create(file1, true, (int) BLOCK_SIZE * 2, (short) 3, BLOCK_SIZE);
AppendTestUtil.write(stm, 0, (int) (BLOCK_SIZE));
stm.sync();
LOG.info("Losing lease");
loseLeases(fs1);
LOG.info("Triggering recovery in another thread");
final AtomicReference<Throwable> err = new AtomicReference<Throwable>();
Thread recoverThread = new Thread() {
public void run() {
try {
recoverFile(fs2);
} catch (Throwable t) {
err.set(t);
}
}
};
recoverThread.start();
LOG.info("Waiting for recovery about to call commitBlockSynchronization");
delayer.waitForCall();
LOG.info("Continuing to write to stream");
AppendTestUtil.write(stm, 0, (int) (BLOCK_SIZE));
try {
stm.sync();
fail("Sync was allowed after recovery started");
} catch (IOException ioe) {
LOG.info("Got expected IOE trying to write to a file from the writer " +
"that lost its lease", ioe);
}
LOG.info("Written more to stream, allowing commit to proceed");
delayer.proceed();
LOG.info("Joining on recovery thread");
recoverThread.join();
if (err.get() != null) {
throw err.get();
}
LOG.info("Now that recovery has finished, still expect further writes to fail.");
try {
AppendTestUtil.write(stm, 0, (int) (BLOCK_SIZE));
stm.sync();
fail("Further writes after recovery finished did not fail!");
} catch (IOException ioe) {
LOG.info("Got expected exception", ioe);
}
LOG.info("Checking that file looks good");
// recovery should have preserved only the first successfully synced
// write (a single full block)
assertFileSize(fs2, BLOCK_SIZE);
checkFile(fs2, BLOCK_SIZE);
} finally {
try {
fs2.close();
fs1.close();
cluster.shutdown();
} catch (Throwable t) {
LOG.warn("Didn't close down cleanly", t);
}
}
LOG.info("STOP");
}
/**
* Mockito answer helper that will throw an exception a given number
* of times before eventually succeeding.
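* Example use (see testDatanodeFailsToCommit above): the first invocation of
* the stubbed method throws a new instance of the given exception class, and
* subsequent invocations fall through to the real method via callRealMethod().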
*/
private static class ThrowNTimesAnswer implements Answer {
private int numTimesToThrow;
private Class<? extends Throwable> exceptionClass;
public ThrowNTimesAnswer(Class<? extends Throwable> exceptionClass,
int numTimesToThrow) {
this.exceptionClass = exceptionClass;
this.numTimesToThrow = numTimesToThrow;
}
public Object answer(InvocationOnMock invocation) throws Throwable {
if (numTimesToThrow-- > 0) {
throw exceptionClass.newInstance();
}
return invocation.callRealMethod();
}
}
// test [3 bbw, 0 HDFS block] with cluster restart
// ** previous HDFS-142 patches hit a problem with multiple outstanding bbw on a single disk **
public void testAppendSync2XBbwClusterRestart() throws Exception {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 1, true, null);
// assumption: this MiniDFS starts up 1 datanode with 2 dirs to load balance
assertTrue(cluster.getDataNodes().get(0).getConf().get("dfs.data.dir").matches("[^,]+,[^,]*"));
FileSystem fs1 = cluster.getFileSystem();
FileSystem fs2 = null;
try {
// create 3 bbw files [so at least one dir has 2 files]
int[] files = new int[]{0,1,2};
Path[] paths = new Path[files.length];
FSDataOutputStream[] stms = new FSDataOutputStream[files.length];
for (int i : files ) {
createFile(fs1, "/bbwRestart" + i + ".test", 1, BBW_SIZE);
stm.sync();
assertFileSize(fs1, 1);
paths[i] = file1;
stms[i] = stm;
}
fs1.close(); // same as: loseLeases()
cluster.shutdown();
LOG.info("STOPPED first instance of the cluster");
cluster = new MiniDFSCluster(conf, 1, false, null);
cluster.waitActive();
LOG.info("START second instance.");
fs2 = cluster.getFileSystem();
// recover 3 bbw files
for (int i : files) {
file1 = paths[i];
recoverFile(fs2);
assertFileSize(fs2, BBW_SIZE);
checkFile(fs2, BBW_SIZE);
}
} finally {
if(fs2 != null) {
fs2.close();
}
fs1.close();
cluster.shutdown();
}
LOG.info("STOP");
}
// test [1 bbw, 1 HDFS block]
public void testAppendSyncBlockPlusBbw() throws Exception {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 1, true, null);
FileSystem fs1 = cluster.getFileSystem();
FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
try {
createFile(fs1, "/blockPlusBbw.test", 1, BLOCK_SIZE + BBW_SIZE);
// 0 before sync()
assertFileSize(fs1, 0);
stm.sync();
// make sure a block received notification is reported
while (fs1.getFileStatus(file1).getLen() == 0) {
Thread.sleep(100);
}
// BLOCK_SIZE plus 1 byte (for the block under construction) after sync()
assertFileSize(fs1, BLOCK_SIZE+1);
loseLeases(fs1);
recoverFile(fs2);
// close() should write recovered bbw to HDFS block
assertFileSize(fs2, BLOCK_SIZE + BBW_SIZE);
checkFile(fs2, BLOCK_SIZE + BBW_SIZE);
} finally {
stm = null;
fs2.close();
fs1.close();
cluster.shutdown();
}
LOG.info("STOP");
}
// we test different datanodes restarting to exercise
// the start, middle, & end of the DFSOutputStream pipeline
public void testAppendSyncReplication0() throws Exception {
replicationTest(0);
}
public void testAppendSyncReplication1() throws Exception {
replicationTest(1);
}
public void testAppendSyncReplication2() throws Exception {
replicationTest(2);
}
void replicationTest(int badDN) throws Exception {
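// Scenario: write half a block at replication 3, stop one datanode
// mid-write, write more, then restart the whole cluster and verify that
// lease recovery keeps the replicas from the two surviving DNs and that
// the block eventually re-replicates back to 3 nodes.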
LOG.info("START");
cluster = new MiniDFSCluster(conf, 3, true, null);
FileSystem fs1 = cluster.getFileSystem();
try {
int halfBlock = (int)BLOCK_SIZE/2;
short rep = 3; // replication
assertTrue(BLOCK_SIZE%4 == 0);
file1 = new Path("/appendWithReplication.dat");
// write 1/2 block & sync
stm = fs1.create(file1, true, (int)BLOCK_SIZE*2, rep, BLOCK_SIZE);
AppendTestUtil.write(stm, 0, halfBlock);
stm.sync();
assertNumCurrentReplicas(rep);
// close one of the datanodes
cluster.stopDataNode(badDN);
// write 1/4 block & sync
AppendTestUtil.write(stm, halfBlock, (int)BLOCK_SIZE/4);
stm.sync();
assertNumCurrentReplicas((short)(rep - 1));
// restart the cluster
/*
* we put the namenode in safe mode first so it doesn't process
* recoverBlock() commands from the remaining DFSClient as the datanodes
* are serially shut down
*/
cluster.getNameNode().setSafeMode(SafeModeAction.SAFEMODE_ENTER);
fs1.close();
cluster.shutdown();
LOG.info("STOPPED first instance of the cluster");
cluster = new MiniDFSCluster(conf, 3, false, null);
cluster.getNameNode().getNamesystem().stallReplicationWork();
cluster.waitActive();
fs1 = cluster.getFileSystem();
LOG.info("START second instance.");
recoverFile(fs1);
LOG.info("Recovered file");
// the 2 DNs with the larger sequence number should win
BlockLocation[] bl = fs1.getFileBlockLocations(
fs1.getFileStatus(file1), 0, BLOCK_SIZE);
LOG.info("Checking blocks");
assertTrue("Should have one block", bl.length == 1);
// Wait up to 1 second for block replication - we may have
// only replication 1 for a brief moment after close, since
// closing only waits for dfs.replication.min replicas, and
// it may take a few millis before the other DN reports the block
waitForBlockReplication(fs1, file1.toString(), 2, 1);
assertFileSize(fs1, BLOCK_SIZE*3/4);
checkFile(fs1, BLOCK_SIZE*3/4);
LOG.info("Checking replication");
// verify that, over time, the block has been replicated to 3 DN
cluster.getNameNode().getNamesystem().restartReplicationWork();
waitForBlockReplication(fs1, file1.toString(), 3, 20);
} finally {
fs1.close();
cluster.shutdown();
}
}
// we test different datanodes restarting to exercise
// the start, middle, & end of the DFSOutputStream pipeline
/* Disable checksum tests for now since they are not valid
public void testAppendSyncChecksum0() throws Exception {
checksumTest(0);
}
public void testAppendSyncChecksum1() throws Exception {
checksumTest(1);
}
public void testAppendSyncChecksum2() throws Exception {
checksumTest(2);
}
*/
void checksumTest(int goodDN) throws Exception {
int deadDN = (goodDN + 1) % 3;
int corruptDN = (goodDN + 2) % 3;
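// Roles: deadDN is stopped before the final quarter block is written,
// corruptDN has its last checksum chunk corrupted while the cluster is
// down, so only goodDN ends up holding an intact, full-length replica.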
LOG.info("START");
cluster = new MiniDFSCluster(conf, 3, true, null);
FileSystem fs1 = cluster.getFileSystem();
try {
int halfBlock = (int)BLOCK_SIZE/2;
short rep = 3; // replication
assertTrue(BLOCK_SIZE%8 == 0);
file1 = new Path("/appendBadChecksum.dat");
// write 1/2 block & sync
stm = fs1.create(file1, true, (int)BLOCK_SIZE*2, rep, BLOCK_SIZE);
AppendTestUtil.write(stm, 0, halfBlock);
stm.sync();
assertNumCurrentReplicas(rep);
// close one of the datanodes
cluster.stopDataNode(deadDN);
// write 1/4 block & sync
AppendTestUtil.write(stm, halfBlock, (int)BLOCK_SIZE/4);
stm.sync();
assertNumCurrentReplicas((short)(rep - 1));
// stop the cluster
cluster.getNameNode().setSafeMode(SafeModeAction.SAFEMODE_ENTER);
fs1.close();
cluster.shutdown();
LOG.info("STOPPED first instance of the cluster");
// give the second datanode a bad CRC
corruptDataNode(corruptDN, CorruptionType.CORRUPT_LAST_CHUNK);
// restart the cluster
cluster = new MiniDFSCluster(conf, 3, false, null);
cluster.getNameNode().getNamesystem().stallReplicationWork();
cluster.waitActive();
fs1 = cluster.getFileSystem();
LOG.info("START second instance.");
// verify that only the good datanode's file is used
recoverFile(fs1);
BlockLocation[] bl = fs1.getFileBlockLocations(
fs1.getFileStatus(file1), 0, BLOCK_SIZE);
assertTrue("Should have one block", bl.length == 1);
assertTrue("Should have 1 replica for that block, not " +
bl[0].getNames().length, bl[0].getNames().length == 1);
assertTrue("The replica should be the datanode with the correct CRC",
cluster.getDataNodes().get(goodDN).getSelfAddr().toString()
.endsWith(bl[0].getNames()[0]) );
assertFileSize(fs1, BLOCK_SIZE*3/4);
// should fail checkFile() if data with the bad CRC was used
checkFile(fs1, BLOCK_SIZE*3/4);
// ensure proper re-replication
cluster.getNameNode().getNamesystem().restartReplicationWork();
waitForBlockReplication(fs1, file1.toString(), 3, 20);
} finally {
fs1.close();
cluster.shutdown();
}
}
// we test different datanodes dying and not coming back
public void testDnDeath0() throws Exception {
dnDeathTest(0);
}
public void testDnDeath1() throws Exception {
dnDeathTest(1);
}
public void testDnDeath2() throws Exception {
dnDeathTest(2);
}
/**
* Test case that writes and completes a file, and then
* tries to recover the file after the old primary
* DN has failed.
*/
void dnDeathTest(int badDN) throws Exception {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 3, true, null);
FileSystem fs1 = cluster.getFileSystem();
try {
int halfBlock = (int)BLOCK_SIZE/2;
short rep = 3; // replication
assertTrue(BLOCK_SIZE%4 == 0);
file1 = new Path("/dnDeath.dat");
// write 1/2 block & close
stm = fs1.create(file1, true, (int)BLOCK_SIZE*2, rep, BLOCK_SIZE);
AppendTestUtil.write(stm, 0, halfBlock);
stm.close();
// close one of the datanodes
cluster.stopDataNode(badDN);
// Recover the lease
recoverFile(fs1);
checkFile(fs1, halfBlock);
} finally {
fs1.close();
cluster.shutdown();
}
}
/**
* This is a dummy test that does nothing
*/
public void testDummyTest() throws IOException {
}
/**
* Test for what happens when the machine doing the write totally
* loses power, and thus when it restarts, the local replica has been
* truncated to 0 bytes (very common with journaling filesystems)
*/
public void testTruncatedPrimaryDN() throws Exception {
LOG.info("START");
runDNRestartCorruptType(CorruptionType.TRUNCATE_BLOCK_TO_ZERO);
}
/**
* Test for what happens when the machine doing the write loses power
* but a previous length of the block being written had made it to the
* journal
*/
public void testHalfLengthPrimaryDN() throws Exception {
LOG.info("START");
runDNRestartCorruptType(CorruptionType.TRUNCATE_BLOCK_HALF);
}
private void runDNRestartCorruptType(CorruptionType corrupt) throws Exception {
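// Simulate power loss on the writer's machine: abort the client streamer
// without recovery, stop the primary DN, damage its local replica in the
// given way, restart it, and check that lease recovery still yields a
// readable 1024-byte file.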
cluster = new MiniDFSCluster(conf, 3, true, null);
FileSystem fs1 = cluster.getFileSystem();
try {
short rep = 3; // replication
assertTrue(BLOCK_SIZE%4 == 0);
file1 = new Path("/dnDeath.dat");
// write 1/2 block & close
stm = fs1.create(file1, true, 1024, rep, 4096);
AppendTestUtil.write(stm, 0, 1024);
stm.sync();
loseLeases(fs1);
DFSOutputStream dfso = (DFSOutputStream)stm.getWrappedStream();
dfso.abortForTests();
// close the primary DN
DataNodeProperties badDN = cluster.stopDataNode(0);
// Truncate the block on the primary DN
corruptDataNode(0, corrupt);
// Start the DN back up
cluster.restartDataNode(badDN);
// Recover the lease
FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
recoverFile(fs2);
assertFileSize(fs2, 1024);
checkFile(fs2, 1024);
} finally {
// explicitly do not shut down fs1, since it's been frozen up by
// killing the DataStreamer and not allowing recovery
cluster.shutdown();
}
}
public void testFullClusterPowerLoss() throws Exception {
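// Both DNs are stopped while a block is mid-write; one comes back with a
// replica truncated to half length. Recovery is expected to settle on the
// shorter replica's length, hence the 512-byte assertions below.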
cluster = new MiniDFSCluster(conf, 2, true, null);
FileSystem fs1 = cluster.getFileSystem();
try {
short rep = 2; // replication
assertTrue(BLOCK_SIZE%4 == 0);
file1 = new Path("/dnDeath.dat");
// write 1/2 block & close
stm = fs1.create(file1, true, 1024, rep, 4096);
AppendTestUtil.write(stm, 0, 1024);
stm.sync();
loseLeases(fs1);
DFSOutputStream dfso = (DFSOutputStream)stm.getWrappedStream();
dfso.abortForTests();
// close the DNs
DataNodeProperties badDN = cluster.stopDataNode(0);
DataNodeProperties badDN2 = cluster.stopDataNode(0); // what was 1 is now 0
assertNotNull(badDN);
assertNotNull(badDN2);
// Truncate one of them as if its journal got corrupted
corruptDataNode(0, CorruptionType.TRUNCATE_BLOCK_HALF);
// Start the DN back up
cluster.restartDataNode(badDN);
cluster.restartDataNode(badDN2);
// Wait for a heartbeat to make sure we get the initial block
// report of the replicasBeingWritten
cluster.waitForDNHeartbeat(0, 10000);
cluster.waitForDNHeartbeat(1, 10000);
// Recover the lease
FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
recoverFile(fs2);
assertFileSize(fs2, 512);
checkFile(fs2, 512);
} finally {
// explicitly do not shut down fs1, since it's been frozen up by
// killing the DataStreamer and not allowing recovery
cluster.shutdown();
}
}
/**
* Test that a file is not considered complete when it only has in-progress
* blocks. This ensures that when a block is appended to, it is converted
* back into the right kind of "in progress" state.
*/
public void testNotPrematurelyComplete() throws Exception {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 3, true, null);
FileSystem fs1 = cluster.getFileSystem();
try {
int halfBlock = (int)BLOCK_SIZE/2;
short rep = 3; // replication
assertTrue(BLOCK_SIZE%4 == 0);
file1 = new Path("/delayedReceiveBlock");
// write 1/2 block & close
stm = fs1.create(file1, true, (int)BLOCK_SIZE*2, rep, BLOCK_SIZE);
AppendTestUtil.write(stm, 0, halfBlock);
stm.close();
NameNode nn = cluster.getNameNode();
LOG.info("======== Appending");
stm = fs1.append(file1);
LOG.info("======== Writing");
AppendTestUtil.write(stm, 0, halfBlock/2);
LOG.info("======== Checking progress");
assertFalse(NameNodeAdapter.checkFileProgress(nn.namesystem, "/delayedReceiveBlock", true));
LOG.info("======== Closing");
stm.close();
} catch (Throwable e) {
e.printStackTrace();
throw new IOException(e);
} finally {
LOG.info("======== Cleaning up");
fs1.close();
cluster.shutdown();
}
}
/**
* Test that the restart of a DN and the subsequent pipeline recovery do not cause
* a file to become prematurely considered "complete". (i.e. that the block
* synchronization as part of pipeline recovery doesn't add the block to the
* nodes taking part in recovery)
*/
public void testNotPrematurelyCompleteWithFailure() throws Exception {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 3, true, null);
FileSystem fs1 = cluster.getFileSystem();
try {
int halfBlock = (int)BLOCK_SIZE/2;
short rep = 3; // replication
assertTrue(BLOCK_SIZE%4 == 0);
file1 = new Path("/delayedReceiveBlock");
// write 1/2 block & close
stm = fs1.create(file1, true, (int)BLOCK_SIZE*2, rep, BLOCK_SIZE);
AppendTestUtil.write(stm, 0, halfBlock);
stm.close();
NameNode nn = cluster.getNameNode();
LOG.info("======== Appending");
stm = fs1.append(file1);
LOG.info("======== Writing");
AppendTestUtil.write(stm, 0, halfBlock/4);
// restart one of the datanodes and wait for a few of its heartbeats
// so that it will report the recovered replica
MiniDFSCluster.DataNodeProperties dnprops = cluster.stopDataNode(0);
stm.sync();
assertTrue(cluster.restartDataNode(dnprops));
for (int i = 0; i < 2; i++) {
cluster.waitForDNHeartbeat(0, 3000);
}
AppendTestUtil.write(stm, 0, halfBlock/4);
LOG.info("======== Checking progress");
assertFalse(NameNodeAdapter.checkFileProgress(nn.namesystem, "/delayedReceiveBlock", true));
LOG.info("======== Closing");
stm.close();
} finally {
LOG.info("======== Cleaning up");
fs1.close();
cluster.shutdown();
}
}
/**
* Test that the restart of a DN and the subsequent pipeline recovery do not cause
* a file to become prematurely considered "complete", when it's a fresh file
* with no .append() called.
*/
public void testNotPrematurelyCompleteWithFailureNotReopened() throws Exception {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 3, true, null);
NameNode nn = cluster.getNameNode();
FileSystem fs1 = cluster.getFileSystem();
try {
short rep = 3; // replication
file1 = new Path("/delayedReceiveBlock");
stm = fs1.create(file1, true, (int)BLOCK_SIZE*2, rep, 64*1024*1024);
LOG.info("======== Writing");
AppendTestUtil.write(stm, 0, 1024*1024);
LOG.info("======== Waiting for a block allocation");
waitForBlockReplication(fs1, "/delayedReceiveBlock", 0, 3000);
LOG.info("======== Checking not complete");
assertFalse(NameNodeAdapter.checkFileProgress(nn.namesystem, "/delayedReceiveBlock", true));
// Stop one of the DNs, don't restart
MiniDFSCluster.DataNodeProperties dnprops = cluster.stopDataNode(0);
// Write some more data
AppendTestUtil.write(stm, 0, 1024*1024);
// Make sure we don't see the file as complete
LOG.info("======== Checking progress");
assertFalse(NameNodeAdapter.checkFileProgress(nn.namesystem, "/delayedReceiveBlock", true));
LOG.info("======== Closing");
stm.close();
} finally {
LOG.info("======== Cleaning up");
fs1.close();
cluster.shutdown();
}
}
/**
* Test case where recovery starts on one node, but it's very slow
* (delayed right after nextGenerationStamp). A second recovery attempt
* completes while this one is being slow. Then we should reject the
* recovery from the first one, since it has a lower gen stamp.
*/
public void testSimultaneousRecoveries() throws Exception {
LOG.info("START");
cluster = new MiniDFSCluster(conf, 3, true, null);
FileSystem fs1 = cluster.getFileSystem();
final FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
try {
createFile(fs1, "/testSimultaneousRecoveries", 3, BBW_SIZE);
stm.sync();
loseLeases(fs1);
// Make the first nextGenerationStamp call get delayed
DelayAnswer delayer = new DelayAnswer(false);
NameNode nn = cluster.getNameNode();
nn.namesystem = spy(nn.namesystem);
doAnswer(delayer).when(nn.namesystem).nextGenerationStampForBlock(
(Block)anyObject(), anyBoolean());
final AtomicReference<Throwable> err = new AtomicReference<Throwable>();
Thread recoverThread = new Thread("Recovery thread") {
public void run() {
try {
recoverFile(fs2);
} catch (Throwable t) {
err.set(t);
}
}
};
recoverThread.start();
LOG.info("Waiting for first nextGenerationStamp to return");
delayer.waitForCall();
LOG.info("Allowing recovery time to try again");
Thread.sleep(10000);
LOG.info("Proceeding first recovery with old GS");
delayer.proceed();
LOG.info("Joining on recovery thread");
recoverThread.join();
LOG.info("Waiting a few seconds for blocks to get corrupted");
Thread.sleep(5000);
// close() should write recovered bbw to HDFS block
assertFileSize(fs2, BBW_SIZE);
checkFile(fs2, BBW_SIZE);
} finally {
fs2.close();
fs1.close();
cluster.shutdown();
}
LOG.info("STOP");
}
/**
* Mockito answer helper that triggers one latch as soon as the
* method is called, then waits on another before continuing.
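* Typical protocol: stub a method with doAnswer(delayer), kick off the
* production code in another thread, call waitForCall() to block until the
* stubbed method is entered, do the racy work, then call proceed() to let
* the stubbed call continue into the real method.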
*/
public static class DelayAnswer implements Answer {
private final CountDownLatch fireLatch = new CountDownLatch(1);
private final CountDownLatch waitLatch = new CountDownLatch(1);
boolean delayBefore = true;
int numTimes = 1;
public DelayAnswer() {}
public DelayAnswer(boolean delayBefore) {
this(delayBefore, 1);
}
public DelayAnswer(boolean delayBefore, int numTimes) {
this.delayBefore = delayBefore;
this.numTimes = numTimes;
}
/**
* Wait until the method is called.
*/
public void waitForCall() throws InterruptedException {
fireLatch.await();
}
/**
* Wait until the method is called with a timeout
*/
public void waitForCall(long timeoutMs) throws InterruptedException {
fireLatch.await(timeoutMs, TimeUnit.MILLISECONDS);
}
/**
* Tell the method to proceed.
* This should only be called after waitForCall()
*/
public void proceed() {
waitLatch.countDown();
}
private void doDelay() throws Throwable {
synchronized (this) {
if (--numTimes < 0) return;
}
LOG.info("DelayAnswer firing fireLatch");
fireLatch.countDown();
try {
LOG.info("DelayAnswer waiting on waitLatch");
waitLatch.await();
LOG.info("DelayAnswer delay complete");
} catch (InterruptedException ie) {
throw new IOException("Interrupted waiting on latch", ie);
}
}
public Object answer(InvocationOnMock invocation) throws Throwable {
if (delayBefore) doDelay();
Object ret = invocation.callRealMethod();
if (!delayBefore) doDelay();
return ret;
}
}
}