Source Code of org.apache.hadoop.raid.BlockReconstructor

package org.apache.hadoop.raid;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.channels.FileChannel;
import java.nio.channels.SocketChannel;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.zip.CRC32;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FilterFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlockWithMetaInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlocksWithMetaInfo;
import org.apache.hadoop.hdfs.protocol.VersionAndOpcode;
import org.apache.hadoop.hdfs.protocol.VersionedLocatedBlocks;
import org.apache.hadoop.hdfs.protocol.WriteBlockHeader;
import org.apache.hadoop.hdfs.protocol.FSConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.datanode.BlockDataFile;
import org.apache.hadoop.hdfs.server.datanode.BlockSender;
import org.apache.hadoop.hdfs.server.datanode.BlockWithChecksumFileReader;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.FSDataset;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.raid.StripeStore.StripeInfo;
import org.apache.hadoop.raid.LogUtils.LOGRESULTS;
import org.apache.hadoop.raid.LogUtils.LOGTYPES;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.Progressable;

/**
 * This class implements the actual reconstruction functionality.
 * It is kept in a separate class so that the distributed block fixer
 * can use it.
 */
abstract class BlockReconstructor extends Configured {

  public static final Log LOG = LogFactory.getLog(BlockReconstructor.class);
  public static final int SEND_BLOCK_MAX_RETRIES = 3;

  BlockReconstructor(Configuration conf) throws IOException {
    super(conf);
  }

  /**
   * Is the path a parity file of a given Codec?
   */
  boolean isParityFile(Path p, Codec c) {
    return isParityFile(p.toUri().getPath(), c);
  }

  boolean isParityFile(String pathStr, Codec c) {
    if (pathStr.contains(RaidNode.HAR_SUFFIX)) {
      return false;
    }
    return pathStr.startsWith(c.getParityPrefix());
  }
 
  /**
   * Fix a file, reporting progress.
   *
   * @return true if file was reconstructed, false if no reconstruction
   * was necessary or possible.
   */
  boolean reconstructFile(Path srcPath, Context context)
      throws IOException, InterruptedException {
    Progressable progress = context;
    if (progress == null) {
      progress = RaidUtils.NULL_PROGRESSABLE;
    }
   
    FileSystem fs = srcPath.getFileSystem(getConf());
    FileStatus srcStat = null;
    try {
      srcStat = fs.getFileStatus(srcPath);
    } catch (FileNotFoundException ex) {
      return false;
    }
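    // Reconstruction is attempted in order: parity HAR part file, then a
    // parity file, then a source file via its parity pair, and finally (for
    // directory-raid codecs) a source file via the stripe store.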

    if (RaidNode.isParityHarPartFile(srcPath)) {
      return processParityHarPartFile(srcPath, progress);
    }

    // Reconstruct parity file
    for (Codec codec : Codec.getCodecs()) {
      if (isParityFile(srcPath, codec)) {
        Decoder decoder = new Decoder(getConf(), codec);
        decoder.connectToStore(srcPath);
        return processParityFile(srcPath,
            decoder, context);
      }
    }

    // Reconstruct source file without connecting to stripe store
    for (Codec codec : Codec.getCodecs()) {
      ParityFilePair ppair = ParityFilePair.getParityFile(
          codec, srcStat, getConf());
      if (ppair != null) {
        Decoder decoder = new Decoder(getConf(), codec);
        decoder.connectToStore(srcPath);
        return processFile(srcPath, ppair, decoder, false, context);
      }
    }
    // Reconstruct source file through stripe store
    for (Codec codec : Codec.getCodecs()) {
      if (!codec.isDirRaid) {
        continue;
      }
      try {
        // try to fix through the stripe store.
        Decoder decoder = new Decoder(getConf(), codec);
        decoder.connectToStore(srcPath);
        if (processFile(srcPath, null, decoder, true, context)) {
          return true;
        }
      } catch (Exception ex) {
        LogUtils.logRaidReconstructionMetrics(LOGRESULTS.FAILURE, 0,
            codec, srcPath, -1, LOGTYPES.OFFLINE_RECONSTRUCTION_USE_STRIPE,
            fs, ex, context);
      }
    }
   
    return false;   
  }

  /**
   * Sorts source files ahead of parity files; parity files are ordered by
   * descending codec priority.
   */
  void sortLostFiles(List<String> files) {
    // TODO: We should first fix the files that lose more blocks
    Comparator<String> comp = new Comparator<String>() {
      public int compare(String p1, String p2) {
        Codec c1 = null;
        Codec c2 = null;
        for (Codec codec : Codec.getCodecs()) {
          // Check both paths against every codec; not else-if, since both
          // may be parity files of the same codec.
          if (isParityFile(p1, codec)) {
            c1 = codec;
          }
          if (isParityFile(p2, codec)) {
            c2 = codec;
          }
        }
        if (c1 == null && c2 == null) {
          return 0; // both are source files
        }
        if (c1 == null && c2 != null) {
          return -1; // only p1 is a source file
        }
        if (c2 == null && c1 != null) {
          return 1; // only p2 is a source file
        }
        return c2.priority - c1.priority; // descending order
      }
    };
    Collections.sort(files, comp);
  }

  /**
   * Returns a DistributedFileSystem hosting the path supplied.
   */
  protected DistributedFileSystem getDFS(Path p) throws IOException {
    FileSystem fs = p.getFileSystem(getConf());
    DistributedFileSystem dfs = null;
    if (fs instanceof DistributedFileSystem) {
      dfs = (DistributedFileSystem) fs;
    } else if (fs instanceof FilterFileSystem) {
      FilterFileSystem ffs = (FilterFileSystem) fs;
      if (ffs.getRawFileSystem() instanceof DistributedFileSystem) {
        dfs = (DistributedFileSystem) ffs.getRawFileSystem();
      }
    }
    return dfs;
  }

  /**
   * Throw exceptions for blocks with lost checksums or stripes
   */
  void checkLostBlocks(List<Block> blocksLostChecksum,
      List<Block> blocksLostStripe, Path p, Codec codec)
      throws IOException {
    StringBuilder message = new StringBuilder();
    if (blocksLostChecksum.size() > 0) {
      message.append("Lost " + blocksLostChecksum.size() +
          " checksums in blocks:");
      for (Block blk : blocksLostChecksum) {
        message.append(" ");
        message.append(blk.toString());
      }
    }
    if (blocksLostStripe.size() > 0) {
      message.append("Lost " + blocksLostStripe.size() +
          " stripes in blocks:");
      for (Block blk : blocksLostStripe) {
        message.append(" ");
        message.append(blk.toString());
      }
    }
    if (message.length() == 0)
      return;
    message.append(" in file " + p);
    throw new IOException(message.toString());
  }
 
  private boolean abortReconstruction(Long oldCRC, Decoder decoder) {
    // A missing checksum is tolerated only for simulated file-level raid
    // (we assume the only such codecs are XOR and RS) when checksum
    // verification is not required; otherwise, abort the reconstruction.
    return oldCRC == null && decoder.checksumStore != null &&
        (decoder.codec.isDirRaid ||
        !decoder.codec.simulateBlockFix ||
         decoder.requiredChecksumVerification);
  }
 
  /**
   * Reads through a source file reconstructing lost blocks on the way.
   * @param srcPath Path identifying the lost file.
   * @throws IOException
   * @return true if file was reconstructed, false if no reconstruction
   * was necessary or possible.
   */
  public boolean processFile(Path srcPath, ParityFilePair parityPair,
      Decoder decoder, Boolean fromStripeStore, Context context)
          throws IOException,
      InterruptedException {
    LOG.info("Processing file " + srcPath);
    Progressable progress = context;
    if (progress == null) {
      progress = RaidUtils.NULL_PROGRESSABLE;
    }
   
    DistributedFileSystem srcFs = getDFS(srcPath);
    FileStatus srcStat = srcFs.getFileStatus(srcPath);
    long blockSize = srcStat.getBlockSize();
    long srcFileSize = srcStat.getLen();
    String uriPath = srcPath.toUri().getPath();

    int numBlocksReconstructed = 0;
    List<LocatedBlockWithMetaInfo> lostBlocks = lostBlocksInFile(srcFs,
        uriPath, srcStat);
    if (lostBlocks.size() == 0) {
      LOG.warn("Couldn't find any lost blocks in file " + srcPath +
          ", ignoring...");
      return false;
    }
    List<Block> blocksLostChecksum = new ArrayList<Block>();
    List<Block> blocksLostStripe = new ArrayList<Block>();
   
    for (LocatedBlockWithMetaInfo lb: lostBlocks) {
      Block lostBlock = lb.getBlock();
      long lostBlockOffset = lb.getStartOffset();

      LOG.info("Found lost block " + lostBlock +
          ", offset " + lostBlockOffset);
      Long oldCRC = decoder.retrieveChecksum(lostBlock,
          srcPath, lostBlockOffset, srcFs, context);
      if (abortReconstruction(oldCRC, decoder)) {
        blocksLostChecksum.add(lostBlock);
        continue;
      }
      StripeInfo si = decoder.retrieveStripe(lostBlock,
          srcPath, lostBlockOffset, srcFs, context, false);
      if (si == null && decoder.stripeStore != null) {
        blocksLostStripe.add(lostBlock);
        continue;
      }

      final long blockContentsSize =
        Math.min(blockSize, srcFileSize - lostBlockOffset);
      File localBlockFile =
        File.createTempFile(lostBlock.getBlockName(), ".tmp");
      localBlockFile.deleteOnExit();

      try {
        CRC32 crc = null;
       
        if (fromStripeStore) {
          crc = decoder.recoverBlockToFileFromStripeInfo(srcFs, srcPath,
              lostBlock, localBlockFile, blockSize,
              lostBlockOffset, blockContentsSize, si, context);
        } else {
          crc = decoder.recoverBlockToFile(srcFs, srcStat,
                        parityPair.getFileSystem(),
                        parityPair.getPath(), blockSize,
                        lostBlockOffset, localBlockFile,
                        blockContentsSize, si, context);
        }
        LOG.info("Recovered crc: " + ((crc == null)?null: crc.getValue()) +
            " expected crc:" + oldCRC);
        if (crc != null && oldCRC != null &&
            crc.getValue() != oldCRC) {
          // checksum doesn't match, it's dangerous to send it
          IOException ioe = new IOException("Block " + lostBlock.toString() +
              " new checksum " + crc.getValue() +
              " doesn't match the old one " + oldCRC);
          LogUtils.logRaidReconstructionMetrics(LOGRESULTS.FAILURE, 0,
              decoder.codec, srcPath, lostBlockOffset,
              LOGTYPES.OFFLINE_RECONSTRUCTION_CHECKSUM_VERIFICATION,
              srcFs, ioe, context);
          throw ioe;
        }
        // Now that we have recovered the file block locally, send it.
        computeMetadataAndSendReconstructedBlock(localBlockFile,
            lostBlock, blockContentsSize,
            lb.getLocations(),
            lb.getDataProtocolVersion(),
            lb.getNamespaceID(),
            progress);
       
        numBlocksReconstructed++;

      } finally {
        localBlockFile.delete();
      }
      progress.progress();
    }
   
    LOG.info("Reconstructed " + numBlocksReconstructed + " blocks in " + srcPath);
    checkLostBlocks(blocksLostChecksum, blocksLostStripe, srcPath, decoder.codec);
    return true;
  }

  /**
   * Reads through a parity file, reconstructing lost blocks on the way.
   * This function uses the corresponding source file to regenerate parity
   * file blocks.
   * @return true if file was reconstructed, false if no reconstruction
   * was necessary or possible.
   */
  boolean processParityFile(Path parityPath, Decoder decoder,
      Context context)
  throws IOException, InterruptedException {
    LOG.info("Processing parity file " + parityPath);
   
    Progressable progress = context;
    if (progress == null) {
      progress = RaidUtils.NULL_PROGRESSABLE;
    }
    DistributedFileSystem parityFs = getDFS(parityPath);
    Path srcPath = RaidUtils.sourcePathFromParityPath(parityPath, parityFs);
    if (srcPath == null) {
      LOG.warn("Could not get regular file corresponding to parity file " +
          parityPath + ", ignoring...");
      return false;
    }
   
    DistributedFileSystem srcFs = getDFS(srcPath);
    FileStatus parityStat = parityFs.getFileStatus(parityPath);
    long blockSize = parityStat.getBlockSize();
    FileStatus srcStat = srcFs.getFileStatus(srcPath);

    // Check timestamp.
    if (srcStat.getModificationTime() != parityStat.getModificationTime()) {
      LOG.warn("Mismatching timestamp for " + srcPath + " and " + parityPath +
          ", ignoring...");
      return false;
    }

    String uriPath = parityPath.toUri().getPath();
    int numBlocksReconstructed = 0;
    List<LocatedBlockWithMetaInfo> lostBlocks =
      lostBlocksInFile(parityFs, uriPath, parityStat);
    if (lostBlocks.size() == 0) {
      LOG.warn("Couldn't find any lost blocks in parity file " + parityPath +
          ", ignoring...");
      return false;
    }
    List<Block> blocksLostChecksum = new ArrayList<Block>();
    List<Block> blocksLostStripe = new ArrayList<Block>();
   
    for (LocatedBlockWithMetaInfo lb: lostBlocks) {
      Block lostBlock = lb.getBlock();
      long lostBlockOffset = lb.getStartOffset();
      LOG.info("Found lost block " + lostBlock +
          ", offset " + lostBlockOffset);
     
      Long oldCRC = decoder.retrieveChecksum(lostBlock,
          parityPath, lostBlockOffset, parityFs, context);
      if (abortReconstruction(oldCRC, decoder)) {
        blocksLostChecksum.add(lostBlock);
        continue;
      }
      StripeInfo si = decoder.retrieveStripe(lostBlock,
          srcPath, lostBlockOffset, srcFs, context, false);
      if (si == null && decoder.stripeStore != null) {
        blocksLostStripe.add(lostBlock);
        continue;
      }
     
      File localBlockFile =
        File.createTempFile(lostBlock.getBlockName(), ".tmp");
      localBlockFile.deleteOnExit();

      try {
        CRC32 crc = decoder.recoverParityBlockToFile(srcFs, srcStat, parityFs,
            parityPath, blockSize, lostBlockOffset, localBlockFile, si,
            context);
        LOG.info("Recovered crc: " + ((crc == null)?null: crc.getValue()) +
            " expected crc:" + oldCRC);
        if (crc != null && oldCRC != null &&
            crc.getValue() != oldCRC) {
          // checksum doesn't match, it's dangerous to send it
          IOException ioe = new IOException("Block " + lostBlock.toString()
              + " new checksum " + crc.getValue()
              + " doesn't match the old one " + oldCRC);
          LogUtils.logRaidReconstructionMetrics(LOGRESULTS.FAILURE, 0,
              decoder.codec, parityPath, lostBlockOffset,
              LOGTYPES.OFFLINE_RECONSTRUCTION_CHECKSUM_VERIFICATION,
              parityFs, ioe, context);
          throw ioe;
        }
        // Now that we have recovered the parity file block locally, send it.
        computeMetadataAndSendReconstructedBlock(
            localBlockFile,
            lostBlock, blockSize, lb.getLocations(),
            lb.getDataProtocolVersion(), lb.getNamespaceID(),
            progress);

        numBlocksReconstructed++;
      } finally {
        localBlockFile.delete();
      }
      progress.progress();
    }
   
    LOG.info("Reconstructed " + numBlocksReconstructed + " blocks in " + parityPath);
    checkLostBlocks(blocksLostChecksum, blocksLostStripe, parityPath, decoder.codec);
    return true;
  }

  /**
   * Reads through a parity HAR part file, reconstructing lost blocks on the way.
   * A HAR block can contain many file blocks, as long as the HAR part file
   * block size is a multiple of the file block size.
   * @return true if file was reconstructed, false if no reconstruction
   * was necessary or possible.
   */
  boolean processParityHarPartFile(Path partFile,
      Progressable progress)
  throws IOException {
    LOG.info("Processing parity HAR file " + partFile);
    // Get some basic information.
    DistributedFileSystem dfs = getDFS(partFile);
    FileStatus partFileStat = dfs.getFileStatus(partFile);
    long partFileBlockSize = partFileStat.getBlockSize();
    LOG.info(partFile + " has block size " + partFileBlockSize);

    // Find the path to the index file.
    // Parity file HARs are only one level deep, so the index file is at the
    // same level as the part file.
    // Parse the HAR index file.
    HarIndex harIndex = HarIndex.getHarIndex(dfs, partFile);
    String uriPath = partFile.toUri().getPath();
    int numBlocksReconstructed = 0;
    List<LocatedBlockWithMetaInfo> lostBlocks = lostBlocksInFile(dfs, uriPath,
        partFileStat);
    if (lostBlocks.size() == 0) {
      LOG.warn("Couldn't find any lost blocks in HAR file " + partFile +
          ", ignoring...");
      return false;
    }
    for (LocatedBlockWithMetaInfo lb: lostBlocks) {
      Block lostBlock = lb.getBlock();
      long lostBlockOffset = lb.getStartOffset();

      File localBlockFile =
        File.createTempFile(lostBlock.getBlockName(), ".tmp");
      localBlockFile.deleteOnExit();

      try {
        processParityHarPartBlock(dfs, partFile,
            lostBlockOffset, partFileStat, harIndex,
            localBlockFile, progress);
       
        // Now that we have recovered the part file block locally, send it.
        computeMetadataAndSendReconstructedBlock(localBlockFile,
            lostBlock,
            localBlockFile.length(),
            lb.getLocations(),
            lb.getDataProtocolVersion(), lb.getNamespaceID(),
            progress);
       
        numBlocksReconstructed++;
      } finally {
        localBlockFile.delete();
      }
      progress.progress();
    }
   
    LOG.info("Reconstructed " + numBlocksReconstructed + " blocks in " + partFile);
    return true;
  }

  /**
   * This reconstructs a single part file block by recovering in sequence each
   * parity block in the part file block.
   */
  private void processParityHarPartBlock(FileSystem dfs, Path partFile,
      long blockOffset,
      FileStatus partFileStat,
      HarIndex harIndex,
      File localBlockFile,
      Progressable progress)
  throws IOException {
    String partName = partFile.toUri().getPath(); // Full path for now; reduced to the part file name below.
    partName = partName.substring(1 + partName.lastIndexOf(Path.SEPARATOR));

    OutputStream out = new FileOutputStream(localBlockFile);

    try {
      // A HAR part file block could map to several parity files. We need to
      // use all of them to recover this block.
      final long blockEnd = Math.min(blockOffset +
          partFileStat.getBlockSize(),
          partFileStat.getLen());
      for (long offset = blockOffset; offset < blockEnd; ) {
        HarIndex.IndexEntry entry = harIndex.findEntry(partName, offset);
        if (entry == null) {
          String msg = "Lost index file has no matching index entry for " +
          partName + ":" + offset;
          LOG.warn(msg);
          throw new IOException(msg);
        }
        Path parityFile = new Path(entry.fileName);
        Encoder encoder = null;
        for (Codec codec : Codec.getCodecs()) {
          if (isParityFile(parityFile, codec)) {
            encoder = new Encoder(getConf(), codec);
          }
        }
        if (encoder == null) {
          String msg = "Could not figure out codec correctly for " + parityFile;
          LOG.warn(msg);
          throw new IOException(msg);
        }
        Path srcFile = RaidUtils.sourcePathFromParityPath(parityFile, dfs);
        if (null == srcFile) {
          String msg = "Can not find the source path for parity file: " +
                          parityFile;
          LOG.warn(msg);
          throw new IOException(msg);
        }
        FileStatus srcStat = dfs.getFileStatus(srcFile);
        if (srcStat.getModificationTime() != entry.mtime) {
          String msg = "Modification times of " + parityFile + " and " +
          srcFile + " do not match.";
          LOG.warn(msg);
          throw new IOException(msg);
        }
        long lostOffsetInParity = offset - entry.startOffset;
        LOG.info(partFile + ":" + offset + " maps to " +
            parityFile + ":" + lostOffsetInParity +
            " and will be recovered from " + srcFile);
        encoder.recoverParityBlockToStream(dfs, srcStat,
            srcStat.getBlockSize(), parityFile,
            lostOffsetInParity, out, progress);
        // Finished recovery of one parity block. Since a parity block has the
        // same size as a source block, we can move offset by source block
        // size.
        offset += srcStat.getBlockSize();
        LOG.info("Recovered " + srcStat.getBlockSize() + " part file bytes ");
        if (offset > blockEnd) {
          String msg =
            "Recovered block spills across part file blocks. Cannot continue";
          throw new IOException(msg);
        }
        progress.progress();
      }
    } finally {
      out.close();
    }
  }

  /**
   * Choose a datanode (hostname:port). The datanode is chosen at
   * random from the live datanodes.
   * @param locationsToAvoid locations to avoid.
   * @return A chosen datanode.
   * @throws IOException
   */
  private DatanodeInfo chooseDatanode(DatanodeInfo[] locationsToAvoid,
                          DatanodeInfo[] live)
  throws IOException {
    LOG.info("Choosing a datanode from " + live.length +
        " live nodes while avoiding " + locationsToAvoid.length);
    Random rand = new Random();
    DatanodeInfo chosen = null;
    int maxAttempts = 1000;
    for (int i = 0; i < maxAttempts && chosen == null; i++) {
      int idx = rand.nextInt(live.length);
      chosen = live[idx];
      for (DatanodeInfo avoid: locationsToAvoid) {
        if (chosen.equals(avoid)) {
          LOG.info("Avoiding " + avoid.name);
          chosen = null;
          break;
        }
      }
    }
    if (chosen == null) {
      throw new IOException("Could not choose datanode");
    }
    LOG.info("Choosing datanode " + chosen.name);
    return chosen;
  }
 
  private DatanodeInfo chooseDatanode(DatanodeInfo[] locationsToAvoid)
                throws IOException {
    DistributedFileSystem dfs = getDFS(new Path("/"));
    DatanodeInfo[] live =
        dfs.getClient().datanodeReport(DatanodeReportType.LIVE);
    return chooseDatanode(locationsToAvoid, live);
  }

  /**
   * Reads the block data from the stream provided and computes the block
   * metadata: the format version, the checksum header, and one CRC per
   * checksum chunk (the contents of the block's metadata file).
   */
  DataInputStream computeMetadata(Configuration conf, InputStream dataStream)
  throws IOException {
    ByteArrayOutputStream mdOutBase = new ByteArrayOutputStream(1024*1024);
    DataOutputStream mdOut = new DataOutputStream(mdOutBase);

    // First, write out the version.
    mdOut.writeShort(FSDataset.FORMAT_VERSION_NON_INLINECHECKSUM);

    // Create a summer and write out its header.
    int bytesPerChecksum = conf.getInt("io.bytes.per.checksum", 512);
    DataChecksum sum =
      DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_CRC32,
          bytesPerChecksum);
    sum.writeHeader(mdOut);

    // Buffer to read in a chunk of data.
    byte[] buf = new byte[bytesPerChecksum];
    // Buffer to store the checksum bytes.
    byte[] chk = new byte[sum.getChecksumSize()];

    // Read data till we reach the end of the input stream.
    int bytesSinceFlush = 0;
    while (true) {
      // Read some bytes.
      int bytesRead = dataStream.read(buf, bytesSinceFlush,
          bytesPerChecksum - bytesSinceFlush);
      if (bytesRead == -1) {
        if (bytesSinceFlush > 0) {
          boolean reset = true;
          sum.writeValue(chk, 0, reset); // This also resets the sum.
          // Write the checksum to the stream.
          mdOut.write(chk, 0, chk.length);
          bytesSinceFlush = 0;
        }
        break;
      }
      // Update the checksum.
      sum.update(buf, bytesSinceFlush, bytesRead);
      bytesSinceFlush += bytesRead;

      // Flush the checksum if necessary.
      if (bytesSinceFlush == bytesPerChecksum) {
        boolean reset = true;
        sum.writeValue(chk, 0, reset); // This also resets the sum.
        // Write the checksum to the stream.
        mdOut.write(chk, 0, chk.length);
        bytesSinceFlush = 0;
      }
    }

    byte[] mdBytes = mdOutBase.toByteArray();
    return new DataInputStream(new ByteArrayInputStream(mdBytes));
  }

  private void computeMetadataAndSendReconstructedBlock(
      File localBlockFile,
      Block block, long blockSize,
      DatanodeInfo[] locations,
      int dataTransferVersion,
      int namespaceId,
      Progressable progress)
  throws IOException {

    LOG.info("Computing metadata");
    FileInputStream blockContents = null;
    DataInputStream blockMetadata = null;
    try {
      blockContents = new FileInputStream(localBlockFile);
      blockMetadata = computeMetadata(getConf(), blockContents);
      blockContents.close();
      progress.progress();
      DatanodeInfo datanode = null;
     
      DistributedFileSystem dfs = getDFS(new Path("/"));
      DatanodeInfo[] live =
          dfs.getClient().datanodeReport(DatanodeReportType.LIVE);
     
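      // Try up to SEND_BLOCK_MAX_RETRIES target datanodes; a datanode that
      // fails is appended to the avoid list before the next attempt.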
      for (int retry = 0; retry < SEND_BLOCK_MAX_RETRIES; ++retry) {
        try {
          datanode = chooseDatanode(locations, live);
          // Reopen
          blockContents = new FileInputStream(localBlockFile);
          sendReconstructedBlock(datanode.name, blockContents, blockMetadata, block,
              blockSize, dataTransferVersion, namespaceId, progress);
          return;
        } catch (IOException ex) {
          if (retry == SEND_BLOCK_MAX_RETRIES - 1) {
            // last retry, rethrow the exception
            throw ex;
          }
         
          // log the warn and retry
          LOG.warn("Got exception when sending the reconstructed block to datanode " +
                  datanode + ", retried: " + retry + " times.", ex);
         
          // add the bad node to the locations.
          DatanodeInfo[] newLocations = new DatanodeInfo[locations.length + 1];
          System.arraycopy(locations, 0, newLocations, 0, locations.length);
          newLocations[locations.length] = datanode;
          locations = newLocations;
        }
      }
    } finally {
      if (blockContents != null) {
        blockContents.close();
        blockContents = null;
      }
      if (blockMetadata != null) {
        blockMetadata.close();
        blockMetadata = null;
      }
    }
  }

  /**
   * Send a generated block to a datanode.
   * @param datanode Chosen datanode name in host:port form.
   * @param blockContents Stream with the block contents.
   * @param block Block object identifying the block to be sent.
   * @param blockSize size of the block.
   * @param dataTransferVersion the data transfer version
   * @param namespaceId namespace id the block belongs to
   * @throws IOException
   */
  private void sendReconstructedBlock(String datanode,
      final FileInputStream blockContents,
      final DataInputStream metadataIn,
      Block block, long blockSize,
      int dataTransferVersion, int namespaceId,
      Progressable progress)
  throws IOException {
    InetSocketAddress target = NetUtils.createSocketAddr(datanode);
    Socket sock = SocketChannel.open().socket();

    int readTimeout =
      getConf().getInt(BlockIntegrityMonitor.BLOCKFIX_READ_TIMEOUT,
          HdfsConstants.READ_TIMEOUT);
    NetUtils.connect(sock, target, readTimeout);
    sock.setSoTimeout(readTimeout);

    int writeTimeout = getConf().getInt(BlockIntegrityMonitor.BLOCKFIX_WRITE_TIMEOUT,
        HdfsConstants.WRITE_TIMEOUT);

    OutputStream baseStream = NetUtils.getOutputStream(sock, writeTimeout);
    DataOutputStream out =
      new DataOutputStream(new BufferedOutputStream(baseStream,
          FSConstants.
          SMALL_BUFFER_SIZE));

    boolean corruptChecksumOk = false;
    boolean chunkOffsetOK = false;
    boolean verifyChecksum = true;
    boolean transferToAllowed = false;

    try {
      LOG.info("Sending block " + block +
          " from " + sock.getLocalSocketAddress().toString() +
          " to " + sock.getRemoteSocketAddress().toString());
      BlockSender blockSender =
        new BlockSender(namespaceId, block, blockSize, 0, blockSize,
            corruptChecksumOk, chunkOffsetOK, verifyChecksum,
            transferToAllowed, dataTransferVersion >= DataTransferProtocol.PACKET_INCLUDE_VERSION_VERSION,
            new BlockWithChecksumFileReader.InputStreamWithChecksumFactory() {
              @Override
              public InputStream createStream(long offset) throws IOException {
                // we are passing 0 as the offset above,
                // so we can safely ignore
                // the offset passed
                return blockContents;
              }

              @Override
              public DataInputStream getChecksumStream() throws IOException {
                return metadataIn;
              }

            @Override
            public BlockDataFile.Reader getBlockDataFileReader()
                throws IOException {
              return BlockDataFile.getDummyDataFileFromFileChannel(
                  blockContents.getChannel()).getReader(null);
            }
          });

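      // Replay the standard client write path: send a versioned
      // OP_WRITE_BLOCK header, then stream the block data and its checksums
      // through BlockSender.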
      WriteBlockHeader header = new WriteBlockHeader(new VersionAndOpcode(
          dataTransferVersion, DataTransferProtocol.OP_WRITE_BLOCK));
      header.set(namespaceId, block.getBlockId(), block.getGenerationStamp(),
          0, false, true, new DatanodeInfo(), 0, null, "");
      header.writeVersionAndOpCode(out);
      header.write(out);
      blockSender.sendBlock(out, baseStream, null, progress);

      LOG.info("Sent block " + block + " to " + datanode);
    } finally {
      sock.close();
      out.close();
    }
  }

  /**
   * Returns the lost blocks in a file.
   */
  abstract List<LocatedBlockWithMetaInfo> lostBlocksInFile(
      DistributedFileSystem fs,
      String uriPath, FileStatus stat)
      throws IOException;

 
  /**
   * This class implements corrupt block fixing functionality.
   */
  public static class CorruptBlockReconstructor extends BlockReconstructor {
   
    public CorruptBlockReconstructor(Configuration conf) throws IOException {
      super(conf);
    }

   
    List<LocatedBlockWithMetaInfo> lostBlocksInFile(DistributedFileSystem fs,
                                        String uriPath,
                                        FileStatus stat)
        throws IOException {
     
      List<LocatedBlockWithMetaInfo> corrupt =
        new LinkedList<LocatedBlockWithMetaInfo>();
      VersionedLocatedBlocks locatedBlocks;
      int namespaceId = 0;
      int methodFingerprint = 0;
      if (DFSClient.isMetaInfoSuppoted(fs.getClient().namenodeProtocolProxy)) {
        LocatedBlocksWithMetaInfo lbksm = fs.getClient().namenode.
                  openAndFetchMetaInfo(uriPath, 0, stat.getLen());
        namespaceId = lbksm.getNamespaceID();
        locatedBlocks = lbksm;
        methodFingerprint = lbksm.getMethodFingerPrint();
        fs.getClient().getNewNameNodeIfNeeded(methodFingerprint);
      } else {
        locatedBlocks = fs.getClient().namenode.open(uriPath, 0, stat.getLen());
      }
      final int dataTransferVersion = locatedBlocks.getDataProtocolVersion();
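      // A block is treated as lost if it is marked corrupt or if it has no
      // remaining locations despite a non-zero length.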
      for (LocatedBlock b: locatedBlocks.getLocatedBlocks()) {
        if (b.isCorrupt() ||
            (b.getLocations().length == 0 && b.getBlockSize() > 0)) {
          corrupt.add(new LocatedBlockWithMetaInfo(b.getBlock(),
              b.getLocations(), b.getStartOffset(),
              dataTransferVersion, namespaceId, methodFingerprint));
        }
      }
      return corrupt;
    }
  }
 
  /**
   * This class implements decommissioning block copying functionality.
   */
  public static class DecommissioningBlockReconstructor extends BlockReconstructor {

    public DecommissioningBlockReconstructor(Configuration conf) throws IOException {
      super(conf);
    }

    List<LocatedBlockWithMetaInfo> lostBlocksInFile(DistributedFileSystem fs,
                                           String uriPath,
                                           FileStatus stat)
        throws IOException {

      List<LocatedBlockWithMetaInfo> decommissioning =
        new LinkedList<LocatedBlockWithMetaInfo>();
      VersionedLocatedBlocks locatedBlocks;
      int namespaceId = 0;
      int methodFingerprint = 0;
      if (DFSClient.isMetaInfoSuppoted(fs.getClient().namenodeProtocolProxy)) {
        LocatedBlocksWithMetaInfo lbksm = fs.getClient().namenode.
                  openAndFetchMetaInfo(uriPath, 0, stat.getLen());
        namespaceId = lbksm.getNamespaceID();
        locatedBlocks = lbksm;
        methodFingerprint = lbksm.getMethodFingerPrint();
        fs.getClient().getNewNameNodeIfNeeded(methodFingerprint);
      } else {
        locatedBlocks = fs.getClient().namenode.open(uriPath, 0, stat.getLen());
      }
      final int dataTransferVersion = locatedBlocks.getDataProtocolVersion();


      for (LocatedBlock b : locatedBlocks.getLocatedBlocks()) {
        if (b.isCorrupt() ||
            (b.getLocations().length == 0 && b.getBlockSize() > 0)) {
          // If corrupt, this block is the responsibility of the CorruptBlockReconstructor
          continue;
        }

        // Copy this block iff all good copies are being decommissioned
        boolean allDecommissioning = true;
        for (DatanodeInfo i : b.getLocations()) {
          allDecommissioning &= i.isDecommissionInProgress();
        }
        if (allDecommissioning) {
          decommissioning.add(new LocatedBlockWithMetaInfo(b.getBlock(),
              b.getLocations(), b.getStartOffset(),
              dataTransferVersion, namespaceId, methodFingerprint));
        }
      }
      return decommissioning;
    }

  }


}
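
Below is a minimal, hypothetical usage sketch; it is not part of the original source. The configuration and file path are placeholders, it assumes a raid-enabled cluster configuration so that Codec.getCodecs() returns the configured codecs, and it must live in the org.apache.hadoop.raid package because reconstructFile is package-private. Passing a null mapper Context is tolerated for progress reporting (reconstructFile falls back to a no-op Progressable), though deeper logging calls may expect a real context.

package org.apache.hadoop.raid;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class BlockReconstructorExample {
  public static void main(String[] args) throws Exception {
    // Placeholder configuration; in practice it should carry the raid and
    // HDFS settings of the target cluster.
    Configuration conf = new Configuration();

    // Use the concrete subclass that fixes corrupt blocks.
    BlockReconstructor fixer =
        new BlockReconstructor.CorruptBlockReconstructor(conf);

    // Hypothetical file path; returns true if any lost block was rebuilt.
    boolean fixed =
        fixer.reconstructFile(new Path("/user/raidtest/file1"), null);
    System.out.println("Reconstructed: " + fixed);
  }
}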