/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.raid;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.net.InetSocketAddress;
import java.net.URISyntaxException;
import java.text.SimpleDateFormat;
import java.text.DateFormat;
import java.util.Date;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import javax.management.StandardMBean;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.raid.Decoder.DecoderInputStream;
import org.apache.hadoop.raid.DistBlockIntegrityMonitor.CorruptFile;
import org.apache.hadoop.raid.DistBlockIntegrityMonitor.CorruptFileStatus;
import org.apache.hadoop.raid.DistBlockIntegrityMonitor.Worker;
import org.apache.hadoop.raid.DistBlockIntegrityMonitor.CorruptFileCounter;
import org.apache.hadoop.raid.DistBlockIntegrityMonitor.CorruptionWorker;
import org.apache.hadoop.raid.DistRaid.EncodingCandidate;
import org.apache.hadoop.raid.LogUtils.LOGRESULTS;
import org.apache.hadoop.raid.LogUtils.LOGTYPES;
import org.apache.hadoop.raid.StripeStore.StripeInfo;
import org.apache.hadoop.raid.protocol.PolicyInfo;
import org.apache.hadoop.raid.protocol.RaidProtocol;
import org.apache.hadoop.raid.StripeStore;
import org.apache.hadoop.tools.HadoopArchives;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.VersionInfo;
import org.json.JSONException;
import org.xml.sax.SAXException;
import org.apache.hadoop.metrics.util.MBeanUtil;
/**
* A base class that implements {@link RaidProtocol}.
*
* Use <code>raid.classname</code> to specify which implementation to use.
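* <p>
* A minimal configuration sketch (hedged; DistRaidNode is one
* implementation referenced elsewhere in this file, and the port matches
* DEFAULT_PORT below):
* <pre>
* Configuration conf = new Configuration();
* conf.set("raid.classname", "org.apache.hadoop.raid.DistRaidNode");
* conf.set("raid.server.address", "localhost:60000");
* </pre>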
*/
public abstract class RaidNode implements RaidProtocol, RaidNodeStatusMBean {
static{
Configuration.addDefaultResource("hdfs-default.xml");
Configuration.addDefaultResource("mapred-default.xml");
Configuration.addDefaultResource("raid-default.xml");
Configuration.addDefaultResource("hdfs-site.xml");
Configuration.addDefaultResource("mapred-site.xml");
Configuration.addDefaultResource("raid-site.xml");
}
// The modification time of a raid candidate must be at least this much
// older than the current time (default: 1 day).
public static final long RAID_MOD_TIME_PERIOD_DEFAULT = 24 * 3600 * 1000;
public static final String RAID_MOD_TIME_PERIOD_KEY = "raid.mod.time.period";
public static final Log LOG = LogFactory.getLog(RaidNode.class);
public static final long SLEEP_TIME = 10000L; // 10 seconds
public static final String TRIGGER_MONITOR_SLEEP_TIME_KEY =
"hdfs.raid.trigger.monitor.sleep.time";
public static final String UNDER_REDUNDANT_FILES_PROCESSOR_SLEEP_TIME_KEY=
"hdfs.raid.under.redundant.files.processor.sleep.time";
public static final int DEFAULT_PORT = 60000;
// We don't raid files that are too small.
public static final long MINIMUM_RAIDABLE_FILESIZE = 10*1024L;
public static final String MINIMUM_RAIDABLE_FILESIZE_KEY =
"hdfs.raid.min.filesize";
public static final String RAID_RECOVERY_LOCATION_KEY =
"hdfs.raid.local.recovery.location";
public static final String DEFAULT_RECOVERY_LOCATION = "/tmp/raidrecovery";
public static final String RAID_PARITY_HAR_THRESHOLD_DAYS_KEY =
"raid.parity.har.threshold.days";
public static final int DEFAULT_RAID_PARITY_HAR_THRESHOLD_DAYS = 3;
public static final String RAID_DIRECTORYTRAVERSAL_SHUFFLE =
"raid.directorytraversal.shuffle";
public static final String RAID_DIRECTORYTRAVERSAL_THREADS =
"raid.directorytraversal.threads";
public static final String RAID_UNDER_REDUNDANT_FILES =
"raid.under.redundant.files";
public static final String RAID_MAPREDUCE_UPLOAD_CLASSES =
"raid.mapreduce.upload.classes";
public static final String RAID_DISABLE_CORRUPT_BLOCK_FIXER_KEY =
"raid.blockreconstruction.corrupt.disable";
public static final String RAID_DISABLE_DECOMMISSIONING_BLOCK_COPIER_KEY =
"raid.blockreconstruction.decommissioning.disable";
public static final String RAID_DISABLE_CORRUPTFILE_COUNTER_KEY =
"raid.corruptfile.counter.disable";
public static final String RAID_CHECKSUM_STORE_CLASS_KEY =
"hdfs.raid.checksum.store.class";
// By default, a checksum store is not required. When this is set to true,
// an exception is thrown if the checksum store is null.
public static final String RAID_CHECKSUM_STORE_REQUIRED_KEY =
"hdfs.raid.checksum.store.required";
public static final String RAID_CHECKSUM_VERIFICATION_REQUIRED_KEY =
"hdfs.raid.checksum.verification.required";
public static final String RAID_STRIPE_STORE_CLASS_KEY =
"hdfs.raid.stripe.store.class";
public static final String RAID_ENCODING_STRIPES_KEY =
"hdfs.raid.stripe.encoding";
public static final int DEFAULT_RAID_ENCODING_STRIPES = 1;
public static final String RAID_PARITY_INITIAL_REPL_KEY =
"hdfs.raid.parity.initial.repl";
public static final int DEFAULT_RAID_PARITY_INITIAL_REPL = 3;
public static final String JOBUSER = "raid";
public static final String HAR_SUFFIX = "_raid.har";
public static final Pattern PARITY_HAR_PARTFILE_PATTERN =
Pattern.compile(".*" + HAR_SUFFIX + "/part-.*");
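// A hedged example: the pattern above matches HAR part files such as
// "/raid/foo_raid.har/part-0" (illustrative path).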
public static final String RAIDNODE_CLASSNAME_KEY = "raid.classname";
public static final DateFormat df = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss-SSS");
public static Random rand = new Random();
/** RPC server */
private Server server;
/** RPC server address */
private InetSocketAddress serverAddress = null;
/** only used for testing purposes */
protected boolean stopRequested = false;
/** Configuration Manager */
private ConfigManager configMgr;
private HttpServer infoServer;
private String infoBindAddress;
private long startTime;
/** hadoop configuration */
protected Configuration conf;
protected boolean initialized = false; // Are we initialized?
protected volatile boolean running = true; // Are we running?
private static long modTimePeriod = RAID_MOD_TIME_PERIOD_DEFAULT;
/** Daemon thread to trigger policies */
Daemon triggerThread = null;
Daemon urfThread = null;
public static long triggerMonitorSleepTime = SLEEP_TIME;
public static long underRedundantFilesProcessorSleepTime = SLEEP_TIME;
/** Daemon thread to delete obsolete parity files */
PurgeMonitor purgeMonitor = null;
Daemon purgeThread = null;
/** Daemon thread to HAR raid directories */
Daemon harThread = null;
/** Daemon thread to fix corrupt files */
BlockIntegrityMonitor blockIntegrityMonitor = null;
Daemon blockFixerThread = null;
Daemon blockCopierThread = null;
Daemon corruptFileCounterThread = null;
/** Daemon thread to collect statistics */
StatisticsCollector statsCollector = null;
Daemon statsCollectorThread = null;
PlacementMonitor placementMonitor = null;
Daemon placementMonitorThread = null;
TriggerMonitor triggerMonitor = null;
UnderRedundantFilesProcessor urfProcessor = null;
private int directoryTraversalThreads;
private boolean directoryTraversalShuffle;
private ObjectName beanName;
private ObjectName raidnodeMXBeanName;
// Statistics about raw HDFS blocks. These counts include all replicas of a block.
public static class Statistics {
long numProcessedBlocks; // total blocks encountered in namespace
long processedSize; // disk space occupied by all blocks
long remainingSize; // total disk space post RAID
long numMetaBlocks; // total blocks in metafile
long metaSize; // total disk space for meta files
public void clear() {
numProcessedBlocks = 0;
processedSize = 0;
remainingSize = 0;
numMetaBlocks = 0;
metaSize = 0;
}
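// A worked example for toString() below (illustrative numbers): with
// processedSize = 100, remainingSize = 40 and metaSize = 10,
// save = 100 - (40 + 10) = 50, so savep = 50, i.e. 50% of raw disk
// space is saved by raiding.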
public String toString() {
long save = processedSize - (remainingSize + metaSize);
long savep = 0;
if (processedSize > 0) {
savep = (save * 100)/processedSize;
}
String msg = " numProcessedBlocks = " + numProcessedBlocks +
" processedSize = " + processedSize +
" postRaidSize = " + remainingSize +
" numMetaBlocks = " + numMetaBlocks +
" metaSize = " + metaSize +
" %save in raw disk space = " + savep;
return msg;
}
}
// Startup options
public static enum StartupOption {
TEST ("-test"),
REGULAR ("-regular");
private String name = null;
private StartupOption(String arg) {this.name = arg;}
public String getName() {return name;}
}
// For unit test
RaidNode() {}
/**
* Start RaidNode.
* <p>
* The raid-node can be started with one of the following startup options:
* <ul>
* <li>{@link StartupOption#REGULAR REGULAR} - normal raid node startup</li>
* </ul>
* The option is passed via configuration field:
* <tt>fs.raidnode.startup</tt>
*
* The conf will be modified to reflect the actual ports on which
* the RaidNode is up and running if the user passes the port as
* <code>zero</code> in the conf.
*
* @param conf configuration
* @throws IOException
*/
RaidNode(Configuration conf) throws IOException {
try {
initialize(conf);
} catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
this.stop();
throw e;
} catch (Exception e) {
this.stop();
throw new IOException(e);
}
}
public long getProtocolVersion(String protocol,
long clientVersion) throws IOException {
if (protocol.equals(RaidProtocol.class.getName())) {
return RaidProtocol.versionID;
} else {
throw new IOException("Unknown protocol to name node: " + protocol);
}
}
@Override
public ProtocolSignature getProtocolSignature(String protocol,
long clientVersion, int clientMethodsHash) throws IOException {
return ProtocolSignature.getProtocolSignature(
this, protocol, clientVersion, clientMethodsHash);
}
/**
* Wait for service to finish.
* (Normally, it runs forever.)
*/
public void join() {
try {
if (server != null) server.join();
if (triggerThread != null) triggerThread.join();
if (urfThread != null) urfThread.join();
if (blockFixerThread != null) blockFixerThread.join();
if (blockCopierThread != null) blockCopierThread.join();
if (corruptFileCounterThread != null) corruptFileCounterThread.join();
if (purgeThread != null) purgeThread.join();
if (statsCollectorThread != null) statsCollectorThread.join();
} catch (InterruptedException ie) {
// do nothing
}
}
/**
* Stop all RaidNode threads and wait for all to finish.
*/
public void stop() {
if (stopRequested) {
return;
}
stopRequested = true;
running = false;
if (server != null) server.stop();
if (triggerThread != null) {
triggerThread.interrupt();
triggerMonitor = null;
}
if (urfThread != null) {
urfThread.interrupt();
urfProcessor = null;
}
if (blockIntegrityMonitor != null) blockIntegrityMonitor.running = false;
if (blockFixerThread != null) blockFixerThread.interrupt();
if (blockCopierThread != null) blockCopierThread.interrupt();
if (corruptFileCounterThread != null) corruptFileCounterThread.interrupt();
if (purgeMonitor != null) purgeMonitor.running = false;
if (purgeThread != null) purgeThread.interrupt();
if (placementMonitor != null) placementMonitor.stop();
if (statsCollector != null) statsCollector.stop();
if (statsCollectorThread != null) statsCollectorThread.interrupt();
if (infoServer != null) {
try {
infoServer.stop();
} catch (Exception e) {
LOG.warn("Exception shutting down " + RaidNode.class, e);
}
}
this.unregisterMBean();
}
private static InetSocketAddress getAddress(String address) {
return NetUtils.createSocketAddr(address);
}
public static InetSocketAddress getAddress(Configuration conf) {
String nodeport = conf.get("raid.server.address");
if (nodeport == null) {
nodeport = "localhost:" + DEFAULT_PORT;
}
return getAddress(nodeport);
}
public InetSocketAddress getListenerAddress() {
return server.getListenerAddress();
}
private void cleanUpDirectory(String dir, Configuration conf)
throws IOException {
Path pdir = new Path(dir);
FileSystem fs = pdir.getFileSystem(conf);
if (fs.exists(pdir)) {
fs.delete(pdir, true); // recursive, matching the deprecated single-arg behavior
}
}
private void cleanUpTempDirectory(Configuration conf) throws IOException {
for (Codec codec: Codec.getCodecs()) {
cleanUpDirectory(codec.tmpParityDirectory, conf);
cleanUpDirectory(codec.tmpHarDirectory, conf);
}
}
private void addTmpJars(Configuration conf) throws URISyntaxException {
StringBuilder jarLocations = new StringBuilder();
String[] uploadClassNames = conf.getStrings(RAID_MAPREDUCE_UPLOAD_CLASSES);
if (uploadClassNames == null || uploadClassNames.length == 0) {
LOG.warn("Key " + RAID_MAPREDUCE_UPLOAD_CLASSES + " is not defined");
return;
}
boolean first = true;
for (String uploadClassName: uploadClassNames) {
try {
String jarLocation = Class.forName(uploadClassName).getProtectionDomain().
getCodeSource().getLocation().toURI().toString();
if (!first) {
jarLocations.append(",");
}
jarLocations.append(jarLocation);
first = false;
} catch (ClassNotFoundException cnfe) {
LOG.warn("Class " + uploadClassName + " is not found", cnfe);
}
}
LOG.info("Load jars " + jarLocations.toString());
conf.set("tmpjars", jarLocations.toString());
}
private void initialize(Configuration conf)
throws IOException, SAXException, InterruptedException, RaidConfigurationException,
ClassNotFoundException, ParserConfigurationException, URISyntaxException, JSONException {
this.startTime = RaidNode.now();
this.conf = conf;
modTimePeriod = conf.getLong(RAID_MOD_TIME_PERIOD_KEY,
RAID_MOD_TIME_PERIOD_DEFAULT);
LOG.info("modTimePeriod: " + modTimePeriod);
InetSocketAddress socAddr = RaidNode.getAddress(conf);
int handlerCount = conf.getInt("fs.raidnode.handler.count", 10);
addTmpJars(this.conf);
// clean up temporary directories
cleanUpTempDirectory(conf);
// read in the configuration
configMgr = new ConfigManager(conf);
// create rpc server
this.server = RPC.getServer(this, socAddr.getAddress().getHostAddress(), socAddr.getPort(),
handlerCount, false, conf);
// create the checksum store if it does not exist
RaidNode.createChecksumStore(conf, true);
// create the stripe store if it does not exist
RaidNode.createStripeStore(conf, true, FileSystem.get(conf));
// The rpc-server port can be ephemeral... ensure we have the correct info
this.serverAddress = this.server.getListenerAddress();
LOG.info("RaidNode up at: " + this.serverAddress);
// Instantiate the metrics singleton.
RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
this.server.start(); // start RPC server
// Create a block integrity monitor and start its thread(s)
this.blockIntegrityMonitor = BlockIntegrityMonitor.createBlockIntegrityMonitor(conf);
boolean useBlockFixer =
!conf.getBoolean(RAID_DISABLE_CORRUPT_BLOCK_FIXER_KEY, false);
boolean useBlockCopier =
!conf.getBoolean(RAID_DISABLE_DECOMMISSIONING_BLOCK_COPIER_KEY, true);
boolean useCorruptFileCounter =
!conf.getBoolean(RAID_DISABLE_CORRUPTFILE_COUNTER_KEY, false);
Runnable fixer = blockIntegrityMonitor.getCorruptionMonitor();
if (useBlockFixer && (fixer != null)) {
this.blockFixerThread = new Daemon(fixer);
this.blockFixerThread.setName("Block Fixer");
this.blockFixerThread.start();
}
Runnable copier = blockIntegrityMonitor.getDecommissioningMonitor();
if (useBlockCopier && (copier != null)) {
this.blockCopierThread = new Daemon(copier);
this.blockCopierThread.setName("Block Copier");
this.blockCopierThread.start();
}
Runnable counter = blockIntegrityMonitor.getCorruptFileCounter();
if (useCorruptFileCounter && counter != null) {
this.corruptFileCounterThread = new Daemon(counter);
this.corruptFileCounterThread.setName("Corrupt File Counter");
this.corruptFileCounterThread.start();
}
// start the daemon thread to fire policies appropriately
RaidNode.triggerMonitorSleepTime = conf.getLong(
TRIGGER_MONITOR_SLEEP_TIME_KEY,
SLEEP_TIME);
RaidNode.underRedundantFilesProcessorSleepTime = conf.getLong(
UNDER_REDUNDANT_FILES_PROCESSOR_SLEEP_TIME_KEY,
SLEEP_TIME);
this.triggerMonitor = new TriggerMonitor();
this.triggerThread = new Daemon(this.triggerMonitor);
this.triggerThread.setName("Trigger Thread");
this.triggerThread.start();
this.urfProcessor = new UnderRedundantFilesProcessor(conf);
this.urfThread = new Daemon(this.urfProcessor);
this.urfThread.setName("UnderRedundantFilesProcessor Thread");
this.urfThread.start();
// start the thread that monitors and moves blocks
this.placementMonitor = new PlacementMonitor(conf);
this.placementMonitor.start();
// start the thread that deletes obsolete parity files
this.purgeMonitor = new PurgeMonitor(conf, placementMonitor, this);
this.purgeThread = new Daemon(purgeMonitor);
this.purgeThread.setName("Purge Thread");
this.purgeThread.start();
// start the thread that creates HAR files
this.harThread = new Daemon(new HarMonitor());
this.harThread.setName("HAR Thread");
this.harThread.start();
// start the thread that collects statistics
this.statsCollector = new StatisticsCollector(this, configMgr, conf);
this.statsCollectorThread = new Daemon(statsCollector);
this.statsCollectorThread.setName("Stats Collector");
this.statsCollectorThread.start();
this.directoryTraversalShuffle =
conf.getBoolean(RAID_DIRECTORYTRAVERSAL_SHUFFLE, true);
this.directoryTraversalThreads =
conf.getInt(RAID_DIRECTORYTRAVERSAL_THREADS, 4);
startHttpServer();
this.registerMBean();
initialized = true;
}
public void registerMBean() {
StandardMBean bean;
try {
beanName = VersionInfo.registerJMX("RaidNode");
bean = new StandardMBean(this, RaidNodeStatusMBean.class);
raidnodeMXBeanName = MBeanUtil.registerMBean("RaidNode", "RaidNodeState", bean);
} catch (NotCompliantMBeanException e) {
LOG.warn("Failed to register RaidNodeStatusMBean", e);
return;
}
LOG.info("Registered RaidNodeStatusMBean");
}
public void unregisterMBean() {
if (this.raidnodeMXBeanName != null) {
MBeanUtil.unregisterMBean(raidnodeMXBeanName);
}
if (this.beanName != null) {
MBeanUtil.unregisterMBean(beanName);
}
LOG.info("Unregistered RaidNodeStatusMBean");
}
private void startHttpServer() throws IOException {
String infoAddr = conf.get("mapred.raid.http.address", "localhost:50091");
InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr);
this.infoBindAddress = infoSocAddr.getAddress().getHostAddress();
int tmpInfoPort = infoSocAddr.getPort();
this.infoServer = new HttpServer("raid", this.infoBindAddress, tmpInfoPort,
tmpInfoPort == 0, conf);
this.infoServer.setAttribute("raidnode", this);
this.infoServer.addInternalServlet("corruptfilecounter", "/corruptfilecounter",
CorruptFileCounterServlet.class);
this.infoServer.start();
LOG.info("Web server started at port " + this.infoServer.getPort());
}
public StatisticsCollector getStatsCollector() {
return this.statsCollector;
}
public HttpServer getInfoServer() {
return infoServer;
}
public PlacementMonitor getPlacementMonitor() {
return this.placementMonitor;
}
public TriggerMonitor getTriggerMonitor() {
return this.triggerMonitor;
}
public UnderRedundantFilesProcessor getURFProcessor() {
return this.urfProcessor;
}
public PurgeMonitor getPurgeMonitor() {
return this.purgeMonitor;
}
public BlockIntegrityMonitor getBlockIntegrityMonitor() {
return blockIntegrityMonitor;
}
public BlockIntegrityMonitor.Status getBlockIntegrityMonitorStatus() {
return blockIntegrityMonitor.getAggregateStatus();
}
public BlockIntegrityMonitor.Status getBlockFixerStatus() {
return ((Worker)blockIntegrityMonitor.getCorruptionMonitor()).getStatus();
}
public BlockIntegrityMonitor.Status getBlockCopierStatus() {
return ((Worker)blockIntegrityMonitor.getDecommissioningMonitor()).getStatus();
}
public double getNumDetectionsPerSec() {
return ((CorruptionWorker)blockIntegrityMonitor.getCorruptionMonitor()).
getNumDetectionsPerSec();
}
public ArrayList<CorruptFile> getCorruptFileList(String monitorDir,
CorruptFileStatus cfs) {
return ((CorruptionWorker)blockIntegrityMonitor.getCorruptionMonitor()).
getCorruptFileList(monitorDir, cfs);
}
// Returns the counter map: the key is a monitored directory and the
// value maps each corrupt-file status to its count.
public Map<String, Map<CorruptFileStatus, Long>> getCorruptFilesCounterMap() {
return ((CorruptionWorker)blockIntegrityMonitor.getCorruptionMonitor()).
getCorruptFilesCounterMap();
}
public long getNumFilesWithMissingBlks() {
return ((CorruptFileCounter)blockIntegrityMonitor.
getCorruptFileCounter()).getFilesWithMissingBlksCnt();
}
public long[] getNumStrpWithMissingBlksRS(){
return ((CorruptFileCounter)blockIntegrityMonitor.
getCorruptFileCounter()).getNumStrpWithMissingBlksRS();
}
public CorruptFileCounter getCorruptFileCounter() {
return (CorruptFileCounter) blockIntegrityMonitor.getCorruptFileCounter();
}
public String getHostName() {
return this.infoBindAddress;
}
public long getStartTime() {
return this.startTime;
}
public Thread.State getStatsCollectorState() {
return this.statsCollectorThread.getState();
}
public Configuration getConf() {
return this.conf;
}
/**
* Determine a PolicyInfo from the codec, to re-generate the parity files
* of modified source files.
*
* @param codec the codec whose policy to look up
* @return the first raidable policy using this codec, or null if none
*/
public PolicyInfo determinePolicy(Codec codec) {
for (PolicyInfo info : configMgr.getAllPolicies()) {
if (!info.getShouldRaid()) {
continue;
}
if (info.getCodecId().equals(codec.id)) {
return info;
}
}
return null;
}
/**
* Implement RaidProtocol methods
*/
/** {@inheritDoc} */
public PolicyInfo[] getAllPolicies() throws IOException {
Collection<PolicyInfo> list = configMgr.getAllPolicies();
return list.toArray(new PolicyInfo[list.size()]);
}
/** {@inheritDoc} */
public String recoverFile(String inStr, long corruptOffset) throws IOException {
throw new IOException("Not supported");
}
/** {@inheritDoc} */
public void sendRecoveryTime(String path, long recoveryTime, String taskId)
throws IOException {
this.blockIntegrityMonitor.sendRecoveryTime(path, recoveryTime, taskId);
}
public boolean startSmokeTest() throws Exception {
return startSmokeTest(true);
}
public boolean startSmokeTest(boolean wait) throws Exception {
Runnable worker = this.blockIntegrityMonitor.getCorruptionMonitor();
if (!(worker instanceof CorruptionWorker)) { // instanceof is false for null
throw new IOException("CorruptionWorker is not found");
}
if (!(this instanceof DistRaidNode)) {
throw new IOException("Current Raid daemon is not DistRaidNode");
}
if (!(this.blockIntegrityMonitor instanceof DistBlockIntegrityMonitor)) {
throw new IOException("Current BlockFix daemon is not DistBlockIntegrityMonitor");
}
SmokeTestThread.LOG.info("[SMOKETEST] Start Raid Smoke Test");
long startTime = System.currentTimeMillis();
SmokeTestThread str = new SmokeTestThread(this);
ExecutorService executor = Executors.newSingleThreadExecutor();
Future<Boolean> future = executor.submit(str);
boolean result = false;
if (wait) {
try {
result = future.get(1200, TimeUnit.SECONDS);
} catch (Throwable exp) {
SmokeTestThread.LOG.info("[SMOKETEST] Get Exception ", exp);
} finally {
executor.shutdownNow();
SmokeTestThread.LOG.info("[SMOKETEST] Finish Raid Smoke Test (" +
(result? "succeed": "fail") + ") using " +
(System.currentTimeMillis() - startTime) + "ms");
if (str.ioe != null) {
throw str.ioe;
}
}
}
return result;
}
/**
* returns the number of raid jobs running for a particular policy
*/
abstract int getRunningJobsForPolicy(String policyName);
class IncreaseReplicationRunnable implements Runnable {
List<String> files = null;
FileSystem fs = null;
AtomicLong failFilesCount = null;
AtomicLong succeedFilesCount = null;
IncreaseReplicationRunnable(List<String> newFiles, FileSystem newFs,
AtomicLong newFailFilesCount, AtomicLong newSucceedFilesCount) {
files = newFiles;
fs = newFs;
failFilesCount = newFailFilesCount;
succeedFilesCount = newSucceedFilesCount;
}
public void run() {
short repl = 3;
int failCount = 0;
int succeedCount = 0;
try {
for (String file: files) {
FileStatus stat = null;
Path p = new Path(file);
try {
stat = fs.getFileStatus(p);
} catch (FileNotFoundException fnfe) {
// File doesn't exist, skip it
continue;
}
if (stat.isDir() || stat.getReplication() >= repl) {
// It's a directory or it already has enough replication, skip it
continue;
}
if (!fs.setReplication(new Path(file), repl)) {
failCount++;
LOG.warn("Fail to increase replication for " + file);
} else {
succeedCount++;
}
}
} catch (Throwable th) {
LOG.error("Fail to increase replication", th);
} finally {
failFilesCount.addAndGet(failCount);
succeedFilesCount.addAndGet(succeedCount);
}
}
}
public class UnderRedundantFilesProcessor implements Runnable {
public static final String UNDER_REDUNDANT_FILES_PROCESSOR_THREADS_NUM_KEY =
"raid.under.redundant.files.processor.threads.num";
public static final String INCREASE_REPLICATION_BATCH_SIZE_KEY =
"raid.increase.replication.batch.size";
public static final int DEFAULT_UNDER_REDUNDANT_FILES_PROCESSOR_THREADS_NUM = 5;
public static final int DEFAULT_INCREASE_REPLICATION_BATCH_SIZE = 50;
int numThreads = DEFAULT_UNDER_REDUNDANT_FILES_PROCESSOR_THREADS_NUM;
int incReplBatch = DEFAULT_INCREASE_REPLICATION_BATCH_SIZE;
long lastFileModificationTime = 0;
AtomicLong failedFilesCount = new AtomicLong(0);
AtomicLong succeedFilesCount = new AtomicLong(0);
String[] monitorDirs = null;
int[] counters = null;
int others = 0;
public UnderRedundantFilesProcessor(Configuration conf) {
numThreads = conf.getInt(UNDER_REDUNDANT_FILES_PROCESSOR_THREADS_NUM_KEY,
DEFAULT_UNDER_REDUNDANT_FILES_PROCESSOR_THREADS_NUM);
incReplBatch = conf.getInt(INCREASE_REPLICATION_BATCH_SIZE_KEY,
DEFAULT_INCREASE_REPLICATION_BATCH_SIZE);
}
public void processUnderRedundantFiles(ExecutorService executor, int counterLen)
throws IOException {
String underRedundantFile = conf.get(RAID_UNDER_REDUNDANT_FILES);
if (underRedundantFile == null) {
return;
}
Path fileListPath = new Path(underRedundantFile);
BufferedReader fileListReader;
final FileSystem fs = fileListPath.getFileSystem(conf);
FileStatus stat = null;
try {
stat = fs.getFileStatus(fileListPath);
if (stat.isDir() || stat.getModificationTime() == lastFileModificationTime) {
// Skip directories and files we have already scanned
return;
}
} catch (FileNotFoundException fnfe) {
return;
}
try {
InputStream in = fs.open(fileListPath);
fileListReader = new BufferedReader(new InputStreamReader(in));
} catch (IOException e) {
LOG.warn("Could not create reader for " + fileListPath, e);
return;
}
int[] newCounters = new int[counterLen];
int newOthers = 0;
String l = null;
List<String> files = new ArrayList<String>();
try {
while ((l = fileListReader.readLine()) != null) {
if (LOG.isDebugEnabled()) {
LOG.debug("checking file " + l);
}
boolean isTmpParity = false;
for (Codec codec: Codec.getCodecs()) {
if (l.startsWith(codec.tmpParityDirectory)) {
isTmpParity = true;
break;
}
}
if (isTmpParity) {
// Skip tmp parity files; the continue must apply to the
// line-reading loop, not the codec loop.
continue;
}
boolean match = false;
for (int i = 0; i < newCounters.length; i++) {
if (l.startsWith(monitorDirs[i])) {
newCounters[i]++;
match = true;
}
}
if (!match) {
newOthers++;
}
files.add(l);
if (files.size() == incReplBatch) {
Runnable work = new IncreaseReplicationRunnable(files, fs,
failedFilesCount, succeedFilesCount);
executor.submit(work);
files = new ArrayList<String>();
}
}
if (files.size() > 0) {
Runnable work = new IncreaseReplicationRunnable(files, fs,
failedFilesCount, succeedFilesCount);
executor.submit(work);
}
counters = newCounters;
others = newOthers;
lastFileModificationTime = stat.getModificationTime();
} catch (IOException e) {
LOG.error("Encountered error in processUnderRedundantFiles", e);
}
}
public void run() {
RaidNodeMetrics rnm = RaidNodeMetrics.getInstance(
RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
rnm.initUnderRedundantFilesMetrics(conf);
ExecutorService executor = null;
ThreadFactory factory = new ThreadFactory() {
final AtomicInteger tnum = new AtomicInteger();
public Thread newThread(Runnable r) {
Thread t = new Thread(r);
t.setName("IncReplication-" + tnum.incrementAndGet());
return t;
}
};
executor = Executors.newFixedThreadPool(numThreads, factory);
monitorDirs = BlockIntegrityMonitor.getCorruptMonitorDirs(conf);
while (running) {
LOG.info("Start process UnderRedundantFiles");
try {
processUnderRedundantFiles(executor, monitorDirs.length);
LOG.info("Update UnderRedundantFiles Metrics");
if (counters != null) {
for (int i = 0; i < counters.length; i++) {
rnm.underRedundantFiles.get(monitorDirs[i]).set(counters[i]);
}
rnm.underRedundantFiles.get(BlockIntegrityMonitor.OTHERS).set(others);
}
} catch (Throwable e) {
LOG.error(e);
}
try {
Thread.sleep(RaidNode.underRedundantFilesProcessorSleepTime);
} catch (InterruptedException ie) {
break;
}
}
executor.shutdown();
}
}
/**
* Periodically checks to see which policies should be fired.
*/
class TriggerMonitor implements Runnable {
class PolicyState {
long startTime = 0;
// A policy may specify either a path for directory traversal
// or a file with the list of files to raid.
DirectoryTraversal pendingTraversal = null;
BufferedReader fileListReader = null;
PolicyState() {}
boolean isFileListReadInProgress() {
return fileListReader != null;
}
void resetFileListRead() throws IOException {
if (fileListReader != null) {
fileListReader.close();
fileListReader = null;
}
}
boolean isScanInProgress() {
return pendingTraversal != null;
}
void resetTraversal() {
pendingTraversal = null;
}
void setTraversal(DirectoryTraversal pendingTraversal) {
this.pendingTraversal = pendingTraversal;
}
}
private Map<String, PolicyState> policyStateMap =
new HashMap<String, PolicyState>();
private volatile long lastTriggerTime = 0;
public long getLastTriggerTime() {
return lastTriggerTime;
}
// only used for testing
public void putPolicyInfo(PolicyInfo info) {
if (!policyStateMap.containsKey(info.getName())) {
policyStateMap.put(info.getName(), new PolicyState());
}
}
public void run() {
while (running) {
try {
doProcess();
} catch (Exception e) {
LOG.error(StringUtils.stringifyException(e));
} finally {
LOG.info("Trigger thread continuing to run...");
}
}
}
private boolean shouldReadFileList(PolicyInfo info) {
if (info.getFileListPath() == null || !info.getShouldRaid()) {
return false;
}
String policyName = info.getName();
PolicyState scanState = policyStateMap.get(policyName);
if (scanState.isFileListReadInProgress()) {
int maxJobsPerPolicy = configMgr.getMaxJobsPerPolicy();
int runningJobsCount = getRunningJobsForPolicy(policyName);
// If there is a scan in progress for this policy, we can have
// up to maxJobsPerPolicy running jobs.
return (runningJobsCount < maxJobsPerPolicy);
} else {
long lastReadStart = scanState.startTime;
return (now() > lastReadStart + configMgr.getPeriodicity());
}
}
/**
* Should we select more files for a policy?
*/
private boolean shouldSelectFiles(PolicyInfo info) {
if (!info.getShouldRaid()) {
return false;
}
String policyName = info.getName();
int runningJobsCount = getRunningJobsForPolicy(policyName);
PolicyState scanState = policyStateMap.get(policyName);
if (scanState.isScanInProgress()) {
int maxJobsPerPolicy = configMgr.getMaxJobsPerPolicy();
// If there is a scan in progress for this policy, we can have
// up to maxJobsPerPolicy running jobs.
return (runningJobsCount < maxJobsPerPolicy);
} else {
// Check the time of the last full traversal before starting a fresh
// traversal.
long lastScan = scanState.startTime;
return (now() > lastScan + configMgr.getPeriodicity());
}
}
/**
* Returns a list of pathnames that need raiding.
* The list of paths could be obtained by resuming a previously suspended
* traversal.
* The number of paths returned is limited by the configured maximum
* number of files per job.
*/
private List<FileStatus> selectFiles(
PolicyInfo info, ArrayList<PolicyInfo> allPolicies) throws IOException {
String policyName = info.getName();
// Max number of files returned.
int selectLimit = configMgr.getMaxFilesPerJob();
PolicyState scanState = policyStateMap.get(policyName);
List<FileStatus> returnSet = new ArrayList<FileStatus>(selectLimit);
DirectoryTraversal traversal;
if (scanState.isScanInProgress()) {
LOG.info("Resuming traversal for policy " + policyName);
traversal = scanState.pendingTraversal;
} else {
LOG.info("Start new traversal for policy " + policyName);
scanState.startTime = now();
if (!Codec.getCodec(info.getCodecId()).isDirRaid) {
traversal = DirectoryTraversal.raidFileRetriever(
info, info.getSrcPathExpanded(), allPolicies, conf,
directoryTraversalThreads, directoryTraversalShuffle,
true);
} else {
traversal = DirectoryTraversal.raidLeafDirectoryRetriever(
info, info.getSrcPathExpanded(), allPolicies, conf,
directoryTraversalThreads, directoryTraversalShuffle,
true);
}
scanState.setTraversal(traversal);
}
FileStatus f;
while ((f = traversal.next()) != DirectoryTraversal.FINISH_TOKEN) {
returnSet.add(f);
if (returnSet.size() == selectLimit) {
return returnSet;
}
}
scanState.resetTraversal();
return returnSet;
}
public List<FileStatus> readFileList(PolicyInfo info) throws IOException {
Path fileListPath = info.getFileListPath();
List<FileStatus> list = new ArrayList<FileStatus>();
long dirRaidNumBlocks = 0L;
if (fileListPath == null) {
return list;
}
int targetReplication = Integer.parseInt(info.getProperty("targetReplication"));
String policyName = info.getName();
PolicyState scanState = policyStateMap.get(policyName);
if (!scanState.isFileListReadInProgress()) {
scanState.startTime = now();
try {
InputStream in = fileListPath.getFileSystem(conf).open(fileListPath);
scanState.fileListReader = new BufferedReader(new InputStreamReader(in));
} catch (IOException e) {
LOG.warn("Could not create reader for " + fileListPath, e);
return list;
}
}
Codec codec = Codec.getCodec(info.getCodecId());
// Max number of blocks/files returned.
int selectLimit = codec.isDirRaid? configMgr.getMaxBlocksPerDirRaidJob():
configMgr.getMaxFilesPerJob();
String l = null;
try {
while ((l = scanState.fileListReader.readLine()) != null) {
if (LOG.isDebugEnabled()) {
LOG.debug("Select files to raid, check: " + l);
}
Path p = new Path(l);
FileSystem fs = p.getFileSystem(conf);
p = fs.makeQualified(p);
FileStatus stat = null;
try {
stat = ParityFilePair.FileStatusCache.get(fs, p);
} catch (FileNotFoundException e) {
LOG.warn("Path " + p + " does not exist", e);
}
if (stat == null) {
continue;
}
short repl = 0;
List<FileStatus> lfs = null;
if (codec.isDirRaid) {
if (!stat.isDir()) {
continue;
}
lfs = RaidNode.listDirectoryRaidFileStatus(conf, fs, p);
if (lfs == null) {
continue;
}
repl = DirectoryStripeReader.getReplication(lfs);
} else {
repl = stat.getReplication();
}
// If the file should not be raided, do not add it to the list.
if (!RaidNode.shouldRaid(conf, fs, stat, codec, lfs)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Should not raid file: " + l);
}
continue;
}
// check the replication.
if ((repl > targetReplication) ||
(repl == targetReplication &&
!ParityFilePair.parityExists(stat, codec, conf))) {
list.add(stat);
if (codec.isDirRaid) {
dirRaidNumBlocks += DirectoryStripeReader.getBlockNum(lfs);
}
}
// For dir-raid, we limit by the number of blocks rather than the
// number of directories.
if ((codec.isDirRaid && dirRaidNumBlocks >= selectLimit) ||
(!codec.isDirRaid && list.size() >= selectLimit)) {
break;
}
}
if (l == null) {
scanState.resetFileListRead();
}
} catch (IOException e) {
LOG.error("Encountered error in file list read ", e);
scanState.resetFileListRead();
}
return list;
}
/**
* Keep processing policies.
* If the config file has changed, then reload config file and start afresh.
*/
private void doProcess() throws IOException, InterruptedException {
ArrayList<PolicyInfo> allPolicies = new ArrayList<PolicyInfo>();
ArrayList<PolicyInfo> allPoliciesWithSrcPath = new ArrayList<PolicyInfo>();
for (PolicyInfo info : configMgr.getAllPolicies()) {
allPolicies.add(info);
if (info.getSrcPath() != null) {
allPoliciesWithSrcPath.add(info);
}
}
while (running) {
Thread.sleep(RaidNode.triggerMonitorSleepTime);
boolean reloaded = configMgr.reloadConfigsIfNecessary();
if (reloaded) {
allPolicies.clear();
allPoliciesWithSrcPath.clear();
for (PolicyInfo info : configMgr.getAllPolicies()) {
allPolicies.add(info);
if (info.getSrcPath() != null) {
allPoliciesWithSrcPath.add(info);
}
}
}
LOG.info("TriggerMonitor.doProcess " + allPolicies.size());
for (PolicyInfo info: allPolicies) {
this.putPolicyInfo(info);
List<FileStatus> filteredPaths = null;
if (shouldReadFileList(info)) {
filteredPaths = readFileList(info);
} else if (shouldSelectFiles(info)) {
LOG.info("Triggering Policy Filter " + info.getName() +
" " + info.getSrcPath());
try {
filteredPaths = selectFiles(info, allPoliciesWithSrcPath);
} catch (Exception e) {
LOG.info("Exception while invoking filter on policy " + info.getName() +
" srcPath " + info.getSrcPath() +
" exception " + StringUtils.stringifyException(e));
continue;
}
} else {
continue;
}
if (filteredPaths == null || filteredPaths.size() == 0) {
LOG.info("No filtered paths for policy " + info.getName());
continue;
}
// Apply the action on accepted paths
LOG.info("Triggering Policy Action " + info.getName() +
" " + filteredPaths.size() + " files");
try {
raidFiles(info, filteredPaths);
} catch (Throwable e) {
LOG.info("Exception while invoking action on policy " + info.getName() +
" srcPath " + info.getSrcPath() +
" exception " + StringUtils.stringifyException(e), e);
continue;
}
}
lastTriggerTime = System.currentTimeMillis();
}
}
}
/**
* Raid a list of files. This is implemented by subclasses of RaidNode.
*/
abstract void raidFiles(PolicyInfo info, List<FileStatus> paths) throws IOException;
public abstract String raidJobsHtmlTable(JobMonitor.STATUS st);
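/**
* Map a source path to its parity path under destPathPrefix. A hedged
* example: with destPathPrefix "/raid" and srcPath "/a/b/c", the parity
* file would be "/raid/a/b/c" (illustrative paths).
*/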
static Path getOriginalParityFile(Path destPathPrefix, Path srcPath) {
return (srcPath == null || srcPath.getParent() == null)? destPathPrefix:
new Path(destPathPrefix, makeRelative(srcPath));
}
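// A hedged worked example for the two helpers below: a 600 MB file with
// a 256 MB block size has ceil(600 / 256) = 3 blocks; with stripeSize =
// 10 that is ceil(3 / 10) = 1 stripe.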
public static long numBlocks(FileStatus stat) {
return (long) Math.ceil(stat.getLen() * 1.0 / stat.getBlockSize());
}
public static long numStripes(long numBlocks, int stripeSize) {
return (long) Math.ceil(numBlocks * 1.0 / stripeSize);
}
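// A hedged worked example for savingFromRaidingFile below (illustrative
// numbers): a 10-block file of 128 MB blocks (1280 MB) at replication 3,
// raided to targetReplication 1 with stripeSize 10, paritySize 4 and
// parityReplication 1, saves 1280 MB * (3 - 1) - 1 * 4 * 1 * 128 MB
// = 2048 MB of raw disk space.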
static long savingFromRaidingFile(
EncodingCandidate ec, int stripeSize, int paritySize,
int targetReplication, int parityReplication) {
if (ec.startStripe != 0)
return 0;
FileStatus stat = ec.srcStat;
long currentReplication = stat.getReplication();
if (currentReplication > targetReplication) {
long numBlocks = numBlocks(stat);
long numStripes = numStripes(numBlocks, stripeSize);
long sourceSaving =
stat.getLen() * (currentReplication - targetReplication);
long parityBlocks = numStripes * paritySize;
return sourceSaving - parityBlocks * parityReplication * stat.getBlockSize();
}
return 0;
}
static public List<EncodingCandidate> splitPaths(Configuration conf,
Codec codec, FileStatus path) throws IOException {
List<FileStatus> lfs = new ArrayList<FileStatus>();
lfs.add(path);
return splitPaths(conf, codec, lfs);
}
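/**
* Split paths into encoding candidates of at most encodingUnit stripes
* each (hdfs.raid.stripe.encoding, default 1). A hedged example: a file
* spanning 25 stripes with an encoding unit of 10 yields three candidates
* starting at stripes 0, 10 and 20, all sharing one encodingId.
*/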
static public List<EncodingCandidate> splitPaths(Configuration conf,
Codec codec, List<FileStatus> paths) throws IOException {
List<EncodingCandidate> lec = new ArrayList<EncodingCandidate>();
long encodingUnit = conf.getLong(RAID_ENCODING_STRIPES_KEY,
DEFAULT_RAID_ENCODING_STRIPES);
FileSystem srcFs = FileSystem.get(conf);
for (FileStatus s : paths) {
if (codec.isDirRaid != s.isDir()) {
continue;
}
long numBlocks = 0L;
if (codec.isDirRaid) {
List<FileStatus> lfs = RaidNode.listDirectoryRaidFileStatus(
conf, srcFs, s.getPath());
if (lfs == null) {
continue;
}
for (FileStatus stat : lfs) {
numBlocks += RaidNode.numBlocks(stat);
}
} else {
numBlocks = RaidNode.numBlocks(s);
}
long numStripes = RaidNode.numStripes(numBlocks, codec.stripeLength);
String encodingId = System.currentTimeMillis() + "." + rand.nextLong();
for (long startStripe = 0; startStripe < numStripes;
startStripe += encodingUnit) {
lec.add(new EncodingCandidate(s, startStripe, encodingId, encodingUnit,
s.getModificationTime()));
}
}
return lec;
}
/**
* RAID a list of files / directories
*/
void doRaid(Configuration conf, PolicyInfo info, List<EncodingCandidate> paths)
throws IOException {
int targetRepl = Integer.parseInt(info.getProperty("targetReplication"));
int metaRepl = Integer.parseInt(info.getProperty("metaReplication"));
Codec codec = Codec.getCodec(info.getCodecId());
Path destPref = new Path(codec.parityDirectory);
String simulate = info.getProperty("simulate");
boolean doSimulate = simulate == null ? false : Boolean
.parseBoolean(simulate);
Statistics statistics = new Statistics();
int count = 0;
for (EncodingCandidate ec : paths) {
doRaid(conf, ec, destPref, codec, statistics,
RaidUtils.NULL_PROGRESSABLE, doSimulate,
targetRepl, metaRepl);
if (count % 1000 == 0) {
LOG.info("RAID statistics " + statistics.toString());
}
count++;
}
LOG.info("RAID statistics " + statistics.toString());
}
static public boolean doRaid(Configuration conf, PolicyInfo info,
FileStatus src, Statistics statistics, Progressable reporter)
throws IOException {
List<EncodingCandidate> lec =
splitPaths(conf, Codec.getCodec(info.getCodecId()), src);
boolean succeed = false;
for (EncodingCandidate ec: lec) {
// Evaluate doRaid first so that a prior success does not short-circuit
// the raiding of the remaining candidates.
succeed = doRaid(conf, info, ec, statistics, reporter) || succeed;
}
return succeed;
}
/**
* RAID an individual file/directory
*/
static public boolean doRaid(Configuration conf, PolicyInfo info,
EncodingCandidate src, Statistics statistics,
Progressable reporter)
throws IOException {
int targetRepl = Integer.parseInt(info.getProperty("targetReplication"));
int metaRepl = Integer.parseInt(info.getProperty("metaReplication"));
Codec codec = Codec.getCodec(info.getCodecId());
Path destPref = new Path(codec.parityDirectory);
String simulate = info.getProperty("simulate");
boolean doSimulate = simulate == null ? false : Boolean
.parseBoolean(simulate);
return doRaid(conf, src, destPref, codec, statistics,
reporter, doSimulate, targetRepl, metaRepl);
}
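/**
* List the raidable files directly under p. Returns null when p contains
* a subdirectory (directory raid only handles leaf directories) or when
* no file meets the minimum raidable size.
*/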
public static List<FileStatus> listDirectoryRaidFileStatus(Configuration conf,
FileSystem srcFs, Path p) throws IOException {
long minFileSize = conf.getLong(MINIMUM_RAIDABLE_FILESIZE_KEY,
MINIMUM_RAIDABLE_FILESIZE);
List<FileStatus> lfs = new ArrayList<FileStatus>();
FileStatus[] files = srcFs.listStatus(p);
if (null == files) {
return null;
}
for (FileStatus stat : files) {
if (stat.isDir()) {
return null;
}
// We don't raid files that are too small
if (stat.getLen() < minFileSize) {
continue;
}
lfs.add(stat);
}
if (lfs.size() == 0)
return null;
return lfs;
}
public static List<LocatedFileStatus> listDirectoryRaidLocatedFileStatus(
Configuration conf, FileSystem srcFs, Path p) throws IOException {
long minFileSize = conf.getLong(MINIMUM_RAIDABLE_FILESIZE_KEY,
MINIMUM_RAIDABLE_FILESIZE);
List<LocatedFileStatus> lfs = new ArrayList<LocatedFileStatus>();
RemoteIterator<LocatedFileStatus> iter = srcFs.listLocatedStatus(p);
while (iter.hasNext()) {
LocatedFileStatus stat = iter.next();
if (stat.isDir()) {
return null;
}
// We don't raid files that are too small
if (stat.getLen() < minFileSize) {
continue;
}
lfs.add(stat);
}
if (lfs.size() == 0)
return null;
return lfs;
}
// Only used by tests.
public static boolean doRaid(Configuration conf, FileStatus stat,
Path destPath, Codec codec, Statistics statistics,
Progressable reporter, boolean doSimulate, int targetRepl, int metaRepl)
throws IOException {
boolean succeed = false;
for (EncodingCandidate ec : RaidNode.splitPaths(conf, codec, stat)) {
// Evaluate doRaid first so that a prior success does not short-circuit
// the raiding of the remaining candidates.
succeed = doRaid(conf, ec, destPath, codec, statistics,
reporter, doSimulate, targetRepl, metaRepl) || succeed;
}
return succeed;
}
public static boolean doRaid(Configuration conf, EncodingCandidate ec,
Path destPath, Codec codec, Statistics statistics,
Progressable reporter, boolean doSimulate, int targetRepl, int metaRepl)
throws IOException {
long startTime = System.currentTimeMillis();
LOGRESULTS result = LOGRESULTS.FAILURE;
Throwable ex = null;
try {
if (codec.isDirRaid) {
result = doDirRaid(conf, ec, destPath, codec, statistics, reporter,
doSimulate, targetRepl, metaRepl);
} else {
result = doFileRaid(conf, ec, destPath, codec, statistics, reporter,
doSimulate, targetRepl, metaRepl);
}
return result == LOGRESULTS.SUCCESS;
} catch (IOException ioe) {
ex = ioe;
throw ioe;
} catch (InterruptedException e) {
ex = e;
throw new IOException(e);
} finally {
long delay = System.currentTimeMillis() - startTime;
long savingBytes = statistics.processedSize -
statistics.remainingSize - statistics.metaSize;
FileStatus stat = ec.srcStat;
FileSystem srcFs = stat.getPath().getFileSystem(conf);
if (result != LOGRESULTS.NOACTION) {
LogUtils.logRaidEncodingMetrics(result, codec,
delay, statistics.processedSize, statistics.numProcessedBlocks,
statistics.numMetaBlocks, statistics.metaSize,
savingBytes, stat.getPath(), LOGTYPES.ENCODING, srcFs, ex,
reporter);
}
}
}
/**
* Check whether the file is already raided by a higher priority codec.
*/
public static boolean raidedByOtherHighPriCodec(Configuration conf,
FileStatus stat, Codec codec) throws IOException {
for (Codec tcodec : Codec.getCodecs()) {
if (tcodec.priority > codec.priority) {
if (stat.isDir() && !tcodec.isDirRaid) {
// A directory could not be raided by a file level codec.
continue;
}
// check if high priority parity file exists.
if (ParityFilePair.parityExists(stat, tcodec, conf)) {
InjectionHandler.processEvent(InjectionEvent.RAID_ENCODING_SKIP_PATH);
return true;
}
}
}
return false;
}
private static boolean tooNewForRaid(FileStatus stat) {
if (System.currentTimeMillis() - stat.getModificationTime() < modTimePeriod) {
InjectionHandler.processEvent(InjectionEvent.RAID_ENCODING_SKIP_PATH_TOO_NEW_MOD);
if (LOG.isDebugEnabled()) {
LOG.debug("Skip file: " + stat.getPath() +
" with too new modification time: " + stat.getModificationTime());
}
return true;
}
return false;
}
/**
* Decide whether a file/directory should be raided with the given codec:
* it must match the codec type, be old enough, have enough blocks, and
* not already be raided by a higher priority codec.
*/
public static boolean shouldRaid(Configuration conf, FileSystem srcFs,
FileStatus stat, Codec codec, List<FileStatus> lfs) throws IOException {
Path p = stat.getPath();
long blockNum = 0L;
if (stat.isDir() != codec.isDirRaid) {
return false;
}
if (tooNewForRaid(stat)) {
return false;
}
blockNum = codec.isDirRaid ?
DirectoryStripeReader.getBlockNum(lfs) : numBlocks(stat);
// If the file/directory has too few blocks, there is nothing to do.
if (blockNum <= RaidState.TOO_SMALL_NOT_RAID_NUM_BLOCKS) {
return false;
}
return !raidedByOtherHighPriCodec(conf, stat, codec);
}
public static boolean shouldRaid(Configuration conf, FileSystem srcFs,
FileStatus stat, Codec codec) throws IOException {
Path p = stat.getPath();
if (stat.isDir() != codec.isDirRaid) {
return false;
}
if (tooNewForRaid(stat)) {
return false;
}
List<FileStatus> lfs = null;
if (codec.isDirRaid) {
// add up the total number of blocks in the directory
lfs = RaidNode.listDirectoryRaidFileStatus(conf, srcFs, p);
if (null == lfs) {
return false;
}
}
return shouldRaid(conf, srcFs, stat, codec, lfs);
}
public static long getNumBlocks(FileStatus stat) {
long numBlocks = stat.getLen() / stat.getBlockSize();
if (stat.getLen() % stat.getBlockSize() == 0)
return numBlocks;
else
return numBlocks + 1;
}
/**
* RAID an individual directory
*/
private static LOGRESULTS doDirRaid(Configuration conf, EncodingCandidate ec,
Path destPath, Codec codec, Statistics statistics,
Progressable reporter, boolean doSimulate, int targetRepl, int metaRepl)
throws IOException {
FileStatus stat = ec.srcStat;
Path p = stat.getPath();
FileSystem srcFs = p.getFileSystem(conf);
List<FileStatus> lfs = RaidNode.listDirectoryRaidFileStatus(conf, srcFs, p);
if (lfs == null) {
return LOGRESULTS.NOACTION;
}
// add up the total number of blocks in the directory
long blockNum = DirectoryStripeReader.getBlockNum(lfs);
// if the directory has 2 or fewer blocks, then there is nothing to do
if (blockNum <= 2) {
return LOGRESULTS.NOACTION;
}
// add up the raw disk space occupied by this directory
long diskSpace = 0;
// we use the maximum replication among the directory's files
int srcRepl = 0;
for (FileStatus fsStat: lfs) {
diskSpace += (fsStat.getLen() * fsStat.getReplication());
if (fsStat.getReplication() > srcRepl) {
srcRepl = fsStat.getReplication();
}
}
long parityBlockSize = DirectoryStripeReader.getParityBlockSize(conf, lfs);
statistics.numProcessedBlocks += blockNum;
statistics.processedSize += diskSpace;
boolean parityGenerated = false;
// generate parity file
try {
parityGenerated = generateParityFile(conf, ec, targetRepl, reporter, srcFs,
destPath, codec, blockNum, srcRepl, metaRepl, parityBlockSize, lfs);
} catch (InterruptedException e) {
throw new IOException (e);
}
if (!parityGenerated)
return LOGRESULTS.NOACTION;
if (!doSimulate) {
for (FileStatus fsStat: lfs) {
if (!srcFs.setReplication(fsStat.getPath(), (short)targetRepl)) {
LOG.info("Error in reducing replication of " +
fsStat.getPath() + " to " + targetRepl);
statistics.remainingSize += diskSpace;
return LOGRESULTS.FAILURE;
}
}
}
diskSpace = 0;
for (FileStatus fsStat: lfs) {
diskSpace += (fsStat.getLen() * targetRepl);
}
statistics.remainingSize += diskSpace;
// the metafile will have this many blocks
long numMeta = blockNum / codec.stripeLength;
if (blockNum % codec.stripeLength != 0) {
numMeta++;
}
// We create numMeta blocks, each with metaRepl replicas.
// The last block of the metafile might not be completely filled up,
// but we ignore that for now.
statistics.numMetaBlocks += (numMeta * metaRepl);
statistics.metaSize += (numMeta * metaRepl * parityBlockSize);
return LOGRESULTS.SUCCESS;
}
/**
* RAID an individual file
* @throws InterruptedException
*/
private static LOGRESULTS doFileRaid(Configuration conf, EncodingCandidate ec,
Path destPath, Codec codec, Statistics statistics,
Progressable reporter, boolean doSimulate, int targetRepl, int metaRepl)
throws IOException, InterruptedException {
FileStatus stat = ec.srcStat;
Path p = stat.getPath();
FileSystem srcFs = p.getFileSystem(conf);
// extract block locations from File system
BlockLocation[] locations = srcFs.getFileBlockLocations(stat, 0, stat.getLen());
// if the file has 2 or fewer blocks, then there is nothing to do
if (locations.length <= 2) {
return LOGRESULTS.NOACTION;
}
// add up the raw disk space occupied by this file
long diskSpace = 0;
for (BlockLocation l: locations) {
diskSpace += (l.getLength() * stat.getReplication());
}
statistics.numProcessedBlocks += locations.length;
statistics.processedSize += diskSpace;
// generate parity file
boolean parityGenerated = generateParityFile(conf, ec, targetRepl, reporter,
srcFs, destPath, codec, locations.length, stat.getReplication(),
metaRepl, stat.getBlockSize(), null);
if (!parityGenerated) {
return LOGRESULTS.NOACTION;
}
if (!doSimulate) {
if (!srcFs.setReplication(p, (short)targetRepl)) {
LOG.info("Error in reducing replication of " + p + " to " + targetRepl);
statistics.remainingSize += diskSpace;
return LOGRESULTS.FAILURE;
}
}
diskSpace = 0;
for (BlockLocation l: locations) {
diskSpace += (l.getLength() * targetRepl);
}
statistics.remainingSize += diskSpace;
// the metafile will have this many blocks
int numMeta = locations.length / codec.stripeLength;
if (locations.length % codec.stripeLength != 0) {
numMeta++;
}
// We create numMeta blocks for every file, each with metaRepl replicas.
// The last block of the metafile might not be completely filled up,
// but we ignore that for now.
statistics.numMetaBlocks += (numMeta * metaRepl);
statistics.metaSize += (numMeta * metaRepl * stat.getBlockSize());
return LOGRESULTS.SUCCESS;
}
/**
* Generate parity file
* @throws InterruptedException
*/
static private boolean generateParityFile(Configuration conf, EncodingCandidate ec,
int targetRepl,
Progressable reporter,
FileSystem inFs,
Path destPathPrefix,
Codec codec,
long blockNum,
int srcRepl,
int metaRepl,
long blockSize,
List<FileStatus> lfs)
throws IOException, InterruptedException {
FileStatus stat = ec.srcStat;
Path inpath = stat.getPath();
Path outpath = getOriginalParityFile(destPathPrefix, inpath);
FileSystem outFs = inFs;
// If the parity file is already up-to-date and the source replication
// is already at target, then there is nothing to do.
try {
FileStatus stmp = outFs.getFileStatus(outpath);
if (stmp.getModificationTime() == stat.getModificationTime() &&
srcRepl == targetRepl) {
LOG.info("Parity file for " + inpath + "(" + blockNum +
") is " + outpath + " already upto-date and " +
"file is at target replication . Nothing more to do.");
return false;
}
} catch (IOException e) {
// ignore errors because the raid file might not exist yet.
}
Encoder encoder = new Encoder(conf, codec);
encoder.verifyStore();
StripeReader sReader = null;
boolean parityGenerated = false;
if (codec.isDirRaid) {
long numStripes = (blockNum % codec.stripeLength == 0) ?
(blockNum / codec.stripeLength) :
((blockNum / codec.stripeLength) + 1);
sReader = new DirectoryStripeReader(conf, codec, inFs,
ec.startStripe, ec.encodingUnit, inpath, lfs);
parityGenerated = encoder.encodeFile(conf, inFs, outFs, outpath,
(short)metaRepl, numStripes, blockSize, reporter, sReader, ec);
} else {
FileStatus srcStat = inFs.getFileStatus(inpath);
long srcSize = srcStat.getLen();
long numBlocks = (srcSize % blockSize == 0) ?
(srcSize / blockSize) :
((srcSize / blockSize) + 1);
long numStripes = (numBlocks % codec.stripeLength == 0) ?
(numBlocks / codec.stripeLength) :
((numBlocks / codec.stripeLength) + 1);
sReader = new FileStripeReader(conf, blockSize,
codec, inFs, ec.startStripe, ec.encodingUnit, inpath, srcSize);
parityGenerated = encoder.encodeFile(conf, inFs, outFs, outpath,
(short)metaRepl, numStripes, blockSize, reporter, sReader, ec);
}
if (!parityGenerated) {
return false;
}
// Set the modification time of the RAID file. This is done so that the
// modTime of the RAID file reflects the contents of the source file that
// it has RAIDed. This should also work for files that are being appended
// to. This is necessary because the time on the destination namenode may
// not be synchronised with the timestamp of the source namenode.
outFs.setTimes(outpath, stat.getModificationTime(), -1);
FileStatus outstat = outFs.getFileStatus(outpath);
FileStatus inStat = inFs.getFileStatus(inpath);
if (stat.getModificationTime() != inStat.getModificationTime()) {
String msg = "Source file changed mtime during raiding from " +
stat.getModificationTime() + " to " + inStat.getModificationTime();
throw new IOException(msg);
}
if (outstat.getModificationTime() != inStat.getModificationTime()) {
String msg = "Parity file mtime " + outstat.getModificationTime() +
" does not match source mtime " + inStat.getModificationTime();
throw new IOException(msg);
}
LOG.info("Source file " + inpath + " of size " + inStat.getLen() +
" Parity file " + outpath + " of size " + outstat.getLen() +
" src mtime " + stat.getModificationTime() +
" parity mtime " + outstat.getModificationTime());
return true;
}
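/**
* Build a DecoderInputStream that reconstructs the corrupt range of
* srcPath. When no parity file pair is available, dir-raid may fall back
* to the stripe store (if useStripeStore is set) to locate the stripe;
* otherwise null is returned.
*/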
public static DecoderInputStream unRaidCorruptInputStream(Configuration conf,
Path srcPath, Codec codec, ParityFilePair parityFilePair,
Block lostBlock,
long blockSize,
long corruptOffset,
long limit,
boolean useStripeStore)
throws IOException {
boolean recoverFromStripeInfo = false;
StripeInfo si = null;
FileSystem srcFs = srcPath.getFileSystem(conf);
Decoder decoder = new Decoder(conf, codec);
// Test if parity file exists
if (parityFilePair == null) {
if (codec.isDirRaid && useStripeStore) {
recoverFromStripeInfo = true;
decoder.connectToStore(srcPath);
si = decoder.retrieveStripe(lostBlock, srcPath, -1, srcFs, null, true);
if (si == null) {
LOG.warn("Could not find " + codec.id + " parity file for " + srcPath
+ ", including the stripe store");
return null;
}
} else {
LOG.warn("Could not find " + codec.id + " parity file for " + srcPath
+ " without stripe store");
return null;
}
}
return decoder.generateAlternateStream(srcFs, srcPath,
parityFilePair == null ? srcFs : parityFilePair.getFileSystem(),
parityFilePair == null ? null : parityFilePair.getPath(),
blockSize, corruptOffset, limit, lostBlock, si, recoverFromStripeInfo, null);
}
private void doHar() throws IOException, InterruptedException {
long prevExec = 0;
while (running) {
// The config may be reloaded by the TriggerMonitor.
// This thread uses whatever config is currently active.
      while (now() < prevExec + configMgr.getPeriodicity()) {
Thread.sleep(SLEEP_TIME);
}
LOG.info("Started archive scan");
prevExec = now();
      // Iterate over all configured codecs.
for (Codec codec : Codec.getCodecs()) {
if (codec.isDirRaid) {
// Disable har for directory raid
continue;
}
try {
String tmpHarPath = codec.tmpHarDirectory;
          int harThreshold = conf.getInt(RAID_PARITY_HAR_THRESHOLD_DAYS_KEY,
            DEFAULT_RAID_PARITY_HAR_THRESHOLD_DAYS);
          long cutoff = now() - (harThreshold * 24L * 3600000L);
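          // Parity files modified within the last harThreshold days are too
          // recent to archive.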
Path destPref = new Path(codec.parityDirectory);
FileSystem destFs = destPref.getFileSystem(conf);
FileStatus destStat = null;
try {
destStat = destFs.getFileStatus(destPref);
} catch (FileNotFoundException e) {
continue;
}
LOG.info("Haring parity files in " + destPref);
recurseHar(codec, destFs, destStat, destPref.toUri().getPath(),
destFs, cutoff, tmpHarPath);
} catch (Exception e) {
LOG.warn("Ignoring Exception while haring ", e);
}
}
}
return;
}
void recurseHar(Codec codec, FileSystem destFs, FileStatus dest,
String destPrefix, FileSystem srcFs, long cutoff, String tmpHarPath)
throws IOException {
if (!dest.isDir()) {
return;
}
Path destPath = dest.getPath(); // pathname, no host:port
String destStr = destPath.toUri().getPath();
    // If the directory is itself a HAR, do nothing.
if (destStr.endsWith(".har")) {
return;
}
    // Skip if it already contains a HAR archive.
    if (destFs.exists(new Path(destPath, destPath.getName() + HAR_SUFFIX))) {
return;
}
boolean shouldHar = false;
FileStatus[] files = destFs.listStatus(destPath);
long harBlockSize = -1;
short harReplication = -1;
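    // A HAR archive is written with a single block size and replication, so
    // archiving is skipped when the parity files disagree on either value.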
if (files != null) {
shouldHar = files.length > 0;
for (FileStatus one: files) {
if (one.isDir()){
recurseHar(codec, destFs, one, destPrefix, srcFs, cutoff, tmpHarPath);
shouldHar = false;
} else if (one.getModificationTime() > cutoff ) {
if (shouldHar) {
LOG.debug("Cannot archive " + destPath +
" because " + one.getPath() + " was modified after cutoff");
shouldHar = false;
}
} else {
if (harBlockSize == -1) {
harBlockSize = one.getBlockSize();
} else if (harBlockSize != one.getBlockSize()) {
LOG.info("Block size of " + one.getPath() + " is " +
one.getBlockSize() + " which is different from " + harBlockSize);
shouldHar = false;
}
if (harReplication == -1) {
harReplication = one.getReplication();
} else if (harReplication != one.getReplication()) {
LOG.info("Replication of " + one.getPath() + " is " +
one.getReplication() + " which is different from " + harReplication);
shouldHar = false;
}
}
}
if (shouldHar) {
String src = destStr.replaceFirst(destPrefix, "");
Path srcPath = new Path(src);
FileStatus[] statuses = srcFs.listStatus(srcPath);
Path destPathPrefix = new Path(destPrefix).makeQualified(destFs);
if (statuses != null) {
for (FileStatus status : statuses) {
          if (ParityFilePair.getParityFile(codec, status, conf) == null) {
LOG.debug("Cannot archive " + destPath +
" because it doesn't contain parity file for " +
status.getPath().makeQualified(srcFs) + " on destination " +
destPathPrefix);
shouldHar = false;
break;
}
}
}
}
}
    if (shouldHar) {
      LOG.info("Archiving " + dest.getPath() + " to " + tmpHarPath);
singleHar(codec, destFs, dest, tmpHarPath, harBlockSize, harReplication);
}
}
private void singleHar(Codec codec, FileSystem destFs,
FileStatus dest, String tmpHarPath, long harBlockSize,
short harReplication) throws IOException {
Random rand = new Random();
Path root = new Path("/");
Path qualifiedPath = dest.getPath().makeQualified(destFs);
String harFileDst = qualifiedPath.getName() + HAR_SUFFIX;
String harFileSrc = qualifiedPath.getName() + "-" +
rand.nextLong() + "-" + HAR_SUFFIX;
// HadoopArchives.HAR_PARTFILE_LABEL is private, so hard-coding the label.
conf.setLong("har.partfile.size", configMgr.getHarPartfileSize());
conf.setLong("har.block.size", harBlockSize);
HadoopArchives har = new HadoopArchives(conf);
String[] args = new String[7];
args[0] = "-Ddfs.replication=" + harReplication;
args[1] = "-archiveName";
args[2] = harFileSrc;
args[3] = "-p";
args[4] = root.makeQualified(destFs).toString();
args[5] = qualifiedPath.toUri().getPath().substring(1);
    args[6] = tmpHarPath;
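    // The args above correspond roughly to the command line (illustrative
    // form only, values taken from the variables above):
    //   hadoop archive -Ddfs.replication=<harReplication> \
    //     -archiveName <harFileSrc> -p <root> <dir-relative-to-root> <tmpHarPath>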
int ret = 0;
Path tmpHar = new Path(tmpHarPath + "/" + harFileSrc);
try {
ret = ToolRunner.run(har, args);
if (ret == 0 && !destFs.rename(tmpHar,
new Path(qualifiedPath, harFileDst))) {
LOG.info("HAR rename didn't succeed from " + tmpHarPath+"/"+harFileSrc +
" to " + qualifiedPath + "/" + harFileDst);
ret = -2;
}
} catch (Exception exc) {
throw new IOException("Error while creating archive " + ret, exc);
} finally {
destFs.delete(tmpHar, true);
}
if (ret != 0){
throw new IOException("Error while creating archive " + ret);
}
return;
}
  /**
   * Periodically archives parity files into HAR archives.
   */
class HarMonitor implements Runnable {
public void run() {
while (running) {
try {
doHar();
} catch (Exception e) {
LOG.error(StringUtils.stringifyException(e));
} finally {
LOG.info("Har parity files thread continuing to run...");
}
}
LOG.info("Leaving Har thread.");
}
}
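  // Illustrative sketch (not part of this file) of how HarMonitor is typically
  // started from within RaidNode:
  //   Thread harThread = new Thread(new HarMonitor(), "HarMonitor");
  //   harThread.setDaemon(true);
  //   harThread.start();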
static boolean isParityHarPartFile(Path p) {
Matcher m = PARITY_HAR_PARTFILE_PATTERN.matcher(p.toUri().getPath());
return m.matches();
}
/**
* Returns current time.
*/
static long now() {
return System.currentTimeMillis();
}
/**
* Make an absolute path relative by stripping the leading /
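   * For example, "/user/raid" becomes "user/raid".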
*/
static Path makeRelative(Path path) {
if (!path.isAbsolute()) {
return path;
}
String p = path.toUri().getPath();
    String relative = p.substring(1);
return new Path(relative);
}
private static void printUsage() {
System.err.println("Usage: java RaidNode ");
}
private static StartupOption parseArguments(String args[]) {
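    // Command-line arguments are currently ignored; startup always proceeds
    // in REGULAR mode.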
StartupOption startOpt = StartupOption.REGULAR;
return startOpt;
}
/**
* Convert command line options to configuration parameters
*/
private static void setStartupOption(Configuration conf, StartupOption opt) {
conf.set("fs.raidnode.startup", opt.toString());
}
/**
* Create an instance of the appropriate subclass of RaidNode
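   * The implementation class is read from the configuration. An illustrative
   * (hypothetical) way to select one:
   * <pre>
   *   conf.setClass(RAIDNODE_CLASSNAME_KEY, DistRaidNode.class, RaidNode.class);
   * </pre>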
*/
public static RaidNode createRaidNode(Configuration conf)
throws ClassNotFoundException {
try {
// default to distributed raid node
Class<?> raidNodeClass =
conf.getClass(RAIDNODE_CLASSNAME_KEY, DistRaidNode.class);
if (!RaidNode.class.isAssignableFrom(raidNodeClass)) {
throw new ClassNotFoundException("not an implementation of RaidNode");
}
Constructor<?> constructor =
raidNodeClass.getConstructor(new Class[] {Configuration.class} );
return (RaidNode) constructor.newInstance(conf);
} catch (NoSuchMethodException e) {
throw new ClassNotFoundException("cannot construct raidnode", e);
} catch (InstantiationException e) {
throw new ClassNotFoundException("cannot construct raidnode", e);
} catch (IllegalAccessException e) {
throw new ClassNotFoundException("cannot construct raidnode", e);
} catch (InvocationTargetException e) {
throw new ClassNotFoundException("cannot construct raidnode", e);
}
}
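  /**
   * Instantiates the configured checksum store, or returns null when no store
   * class is configured or initialization fails.
   */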
  public static ChecksumStore createChecksumStore(Configuration conf,
      boolean createStore) {
    Class<?> checksumStoreClass =
        conf.getClass(RAID_CHECKSUM_STORE_CLASS_KEY, null);
if (checksumStoreClass == null) {
return null;
}
ChecksumStore checksumStore = (ChecksumStore) ReflectionUtils.newInstance(
checksumStoreClass, conf);
try {
checksumStore.initialize(conf, createStore);
} catch (IOException ioe) {
LOG.error("Fail to initialize checksum store", ioe);
checksumStore = null;
}
return checksumStore;
}
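  /**
   * Instantiates the configured stripe store, or returns null when no store
   * class is configured or initialization fails.
   */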
  public static StripeStore createStripeStore(Configuration conf,
      boolean createStore, FileSystem fs) {
    Class<?> stripeStoreClass =
        conf.getClass(RAID_STRIPE_STORE_CLASS_KEY, null);
if (stripeStoreClass == null) {
return null;
}
StripeStore stripeStore = (StripeStore) ReflectionUtils.newInstance(
stripeStoreClass, conf);
try {
stripeStore.initialize(conf, createStore, fs);
} catch (IOException ioe) {
LOG.error("Fail to initialize stripe store", ioe);
stripeStore = null;
}
return stripeStore;
}
/**
* Create an instance of the RaidNode
*/
public static RaidNode createRaidNode(String argv[], Configuration conf)
throws IOException, ClassNotFoundException {
if (conf == null) {
conf = new Configuration();
}
StartupOption startOpt = parseArguments(argv);
if (startOpt == null) {
printUsage();
return null;
}
setStartupOption(conf, startOpt);
RaidNode node = createRaidNode(conf);
return node;
}
  /**
   * Get the job id from the configuration, generating and caching a local
   * id when none is present (i.e., outside of a MapReduce task).
   */
public static String getJobID(Configuration conf) {
String jobId = conf.get("mapred.job.id", null);
if (jobId == null) {
jobId = "localRaid" + df.format(new Date());
conf.set("mapred.job.id", jobId);
}
return jobId;
}
public String getReadReconstructionMetricsUrl() {
return configMgr.getReadReconstructionMetricsUrl();
}
@Override //RaidNodeStatusMBean
public long getTimeSinceLastSuccessfulFix() {
return this.blockIntegrityMonitor.getTimeSinceLastSuccessfulFix();
}
@Override //RaidNodeStatusMBean
public long getNumUnderRedundantFilesFailedIncreaseReplication() {
return this.urfProcessor.failedFilesCount.get();
}
@Override //RaidNodeStatusMBean
public long getNumUnderRedundantFilesSucceededIncreaseReplication() {
return this.urfProcessor.succeedFilesCount.get();
}
public static void main(String argv[]) throws Exception {
try {
StringUtils.startupShutdownMessage(RaidNode.class, argv, LOG);
RaidNode raid = createRaidNode(argv, null);
if (raid != null) {
raid.join();
}
} catch (Throwable e) {
LOG.error(StringUtils.stringifyException(e));
System.exit(-1);
}
}
}