/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.TaskLog;
import org.apache.hadoop.mapred.TaskLog.LogName;
import org.apache.hadoop.mapred.TaskLog.LogFileDetail;
import org.apache.hadoop.util.StringUtils;
class TaskLogsMonitor extends Thread {
static final Log LOG = LogFactory.getLog(TaskLogsMonitor.class);
long mapRetainSize, reduceRetainSize;
public TaskLogsMonitor(long mapRetSize, long reduceRetSize) {
mapRetainSize = mapRetSize;
reduceRetainSize = reduceRetSize;
LOG.info("Starting logs' monitor with mapRetainSize=" + mapRetainSize
+ " and reduceRetainSize=" + reduceRetSize);
}
/**
* The list of tasks that have finished and so need their logs to be
* truncated.
*/
private Map<TaskAttemptID, PerJVMInfo> finishedJVMs =
new HashMap<TaskAttemptID, PerJVMInfo>();
private static final int DEFAULT_BUFFER_SIZE = 4 * 1024;
static final int MINIMUM_RETAIN_SIZE_FOR_TRUNCATION = 0;
private static class PerJVMInfo {
List<Task> allAttempts;
public PerJVMInfo(List<Task> allAtmpts) {
this.allAttempts = allAtmpts;
}
}
/**
* Process(JVM/debug script) has finished. Asynchronously truncate the logs of
* all the corresponding tasks to the configured limit. In case of JVM, both
* the firstAttempt as well as the list of all attempts that ran in the same
* JVM have to be passed. For debug script, the (only) attempt itself should
* be passed as both the firstAttempt as well as the list of attempts.
*
* @param firstAttempt
* @param isTaskCleanup
*/
void addProcessForLogTruncation(TaskAttemptID firstAttempt,
List<Task> allAttempts) {
LOG.info("Adding the jvm with first-attempt " + firstAttempt
+ " for logs' truncation");
PerJVMInfo lInfo = new PerJVMInfo(allAttempts);
synchronized (finishedJVMs) {
finishedJVMs.put(firstAttempt, lInfo);
finishedJVMs.notify();
}
}
/**
* Process the removed task's logs. This involves truncating them to
* retainSize.
*/
void truncateLogs(TaskAttemptID firstAttempt, PerJVMInfo lInfo) {
// Read the log-file details for all the attempts that ran in this JVM
Map<Task, Map<LogName, LogFileDetail>> taskLogFileDetails;
try {
taskLogFileDetails = getAllLogsFileDetails(lInfo.allAttempts);
} catch (IOException e) {
LOG.warn(
"Exception in truncateLogs while getting allLogsFileDetails()."
+ " Ignoring the truncation of logs of this process.", e);
return;
}
Map<Task, Map<LogName, LogFileDetail>> updatedTaskLogFileDetails =
new HashMap<Task, Map<LogName, LogFileDetail>>();
File attemptLogDir = TaskLog.getBaseDir(firstAttempt.toString());
FileWriter tmpFileWriter;
FileReader logFileReader;
// Now truncate file by file
logNameLoop: for (LogName logName : LogName.values()) {
File logFile = TaskLog.getTaskLogFile(firstAttempt, logName);
if (!logFile.exists()) {
continue;
}
// //// Optimization: if no task is over limit, just skip truncation-code
if (logFile.exists()
&& !isTruncationNeeded(lInfo, taskLogFileDetails, logName)) {
LOG.debug("Truncation is not needed for "
+ logFile.getAbsolutePath());
continue;
}
// //// End of optimization
// Truncation is needed for this log-file. Go ahead now.
File tmpFile = new File(attemptLogDir, "truncate.tmp");
try {
tmpFileWriter = new FileWriter(tmpFile);
} catch (IOException ioe) {
LOG.warn("Cannot open " + tmpFile.getAbsolutePath()
+ " for writing truncated log-file "
+ logFile.getAbsolutePath()
+ ". Continuing with other log files. ", ioe);
continue;
}
try {
logFileReader = new FileReader(logFile);
} catch (FileNotFoundException fe) {
LOG.warn("Cannot open " + logFile.getAbsolutePath()
+ " for reading. Continuing with other log files");
if (!tmpFile.delete()) {
LOG.warn("Cannot delete tmpFile " + tmpFile.getAbsolutePath());
}
continue;
}
long newCurrentOffset = 0;
// Process each attempt from the ordered list passed.
for (Task task : lInfo.allAttempts) {
// Truncate the log files of this task-attempt so that only the last
// retainSize many bytes of this log file is retained and the log
// file is reduced in size saving disk space.
long retainSize =
(task.isMapTask() ? mapRetainSize : reduceRetainSize);
LogFileDetail newLogFileDetail = new LogFileDetail();
try {
newLogFileDetail =
truncateALogFileOfAnAttempt(task.getTaskID(),
taskLogFileDetails.get(task).get(logName), retainSize,
tmpFileWriter, logFileReader);
} catch (IOException ioe) {
LOG.warn("Cannot truncate the log file "
+ logFile.getAbsolutePath()
+ ". Caught exception while handling " + task.getTaskID(),
ioe);
// revert back updatedTaskLogFileDetails
revertIndexFileInfo(lInfo, taskLogFileDetails,
updatedTaskLogFileDetails, logName);
if (!tmpFile.delete()) {
LOG.warn("Cannot delete tmpFile " + tmpFile.getAbsolutePath());
}
continue logNameLoop;
}
// Track information for updating the index file properly.
// Index files don't track DEBUGOUT and PROFILE logs, so skip'em.
if (TaskLog.LOGS_TRACKED_BY_INDEX_FILES.contains(logName)) {
if (!updatedTaskLogFileDetails.containsKey(task)) {
updatedTaskLogFileDetails.put(task,
new HashMap<LogName, LogFileDetail>());
}
// newLogFileDetail already has the location and length set, just
// set the start offset now.
newLogFileDetail.start = newCurrentOffset;
updatedTaskLogFileDetails.get(task).put(logName, newLogFileDetail);
newCurrentOffset += newLogFileDetail.length;
}
}
try {
tmpFileWriter.close();
} catch (IOException ioe) {
LOG.warn("Couldn't close the tmp file " + tmpFile.getAbsolutePath()
+ ". Deleting it.", ioe);
revertIndexFileInfo(lInfo, taskLogFileDetails,
updatedTaskLogFileDetails, logName);
if (!tmpFile.delete()) {
LOG.warn("Cannot delete tmpFile " + tmpFile.getAbsolutePath());
}
continue;
}
if (!tmpFile.renameTo(logFile)) {
// If the tmpFile cannot be renamed revert back
// updatedTaskLogFileDetails to maintain the consistency of the
// original log file
revertIndexFileInfo(lInfo, taskLogFileDetails,
updatedTaskLogFileDetails, logName);
if (!tmpFile.delete()) {
LOG.warn("Cannot delete tmpFile " + tmpFile.getAbsolutePath());
}
}
}
// Update the index files
updateIndicesAfterLogTruncation(firstAttempt, updatedTaskLogFileDetails);
}
/**
* @param lInfo
* @param taskLogFileDetails
* @param updatedTaskLogFileDetails
* @param logName
*/
private void revertIndexFileInfo(PerJVMInfo lInfo,
Map<Task, Map<LogName, LogFileDetail>> taskLogFileDetails,
Map<Task, Map<LogName, LogFileDetail>> updatedTaskLogFileDetails,
LogName logName) {
if (TaskLog.LOGS_TRACKED_BY_INDEX_FILES.contains(logName)) {
for (Task task : lInfo.allAttempts) {
if (!updatedTaskLogFileDetails.containsKey(task)) {
updatedTaskLogFileDetails.put(task,
new HashMap<LogName, LogFileDetail>());
}
updatedTaskLogFileDetails.get(task).put(logName,
taskLogFileDetails.get(task).get(logName));
}
}
}
/**
* Get the logFileDetails of all the list of attempts passed.
*
* @param lInfo
* @return a map of task to the log-file detail
* @throws IOException
*/
private Map<Task, Map<LogName, LogFileDetail>> getAllLogsFileDetails(
final List<Task> allAttempts) throws IOException {
Map<Task, Map<LogName, LogFileDetail>> taskLogFileDetails =
new HashMap<Task, Map<LogName, LogFileDetail>>();
for (Task task : allAttempts) {
Map<LogName, LogFileDetail> allLogsFileDetails;
allLogsFileDetails =
TaskLog.getAllLogsFileDetails(task.getTaskID(),
task.isTaskCleanupTask());
taskLogFileDetails.put(task, allLogsFileDetails);
}
return taskLogFileDetails;
}
/**
* Check if truncation of logs is needed for the given jvmInfo. If all the
* tasks that ran in a JVM are within the log-limits, then truncation is not
* needed. Otherwise it is needed.
*
* @param lInfo
* @param taskLogFileDetails
* @param logName
* @return true if truncation is needed, false otherwise
*/
private boolean isTruncationNeeded(PerJVMInfo lInfo,
Map<Task, Map<LogName, LogFileDetail>> taskLogFileDetails,
LogName logName) {
boolean truncationNeeded = false;
LogFileDetail logFileDetail = null;
for (Task task : lInfo.allAttempts) {
long taskRetainSize =
(task.isMapTask() ? mapRetainSize : reduceRetainSize);
Map<LogName, LogFileDetail> allLogsFileDetails =
taskLogFileDetails.get(task);
logFileDetail = allLogsFileDetails.get(logName);
if (taskRetainSize > MINIMUM_RETAIN_SIZE_FOR_TRUNCATION
&& logFileDetail.length > taskRetainSize) {
truncationNeeded = true;
break;
}
}
return truncationNeeded;
}
/**
* Truncate the log file of this task-attempt so that only the last retainSize
* many bytes of each log file is retained and the log file is reduced in size
* saving disk space.
*
* @param taskID Task whose logs need to be truncated
* @param oldLogFileDetail contains the original log details for the attempt
* @param taskRetainSize retain-size
* @param tmpFileWriter New log file to write to. Already opened in append
* mode.
* @param logFileReader Original log file to read from.
* @return
* @throws IOException
*/
private LogFileDetail truncateALogFileOfAnAttempt(
final TaskAttemptID taskID, final LogFileDetail oldLogFileDetail,
final long taskRetainSize, final FileWriter tmpFileWriter,
final FileReader logFileReader) throws IOException {
LogFileDetail newLogFileDetail = new LogFileDetail();
// ///////////// Truncate log file ///////////////////////
// New location of log file is same as the old
newLogFileDetail.location = oldLogFileDetail.location;
if (taskRetainSize > MINIMUM_RETAIN_SIZE_FOR_TRUNCATION
&& oldLogFileDetail.length > taskRetainSize) {
LOG.info("Truncating logs for " + taskID + " from "
+ oldLogFileDetail.length + "bytes to " + taskRetainSize
+ "bytes.");
newLogFileDetail.length = taskRetainSize;
} else {
LOG.info("No truncation needed for " + taskID + " length is "
+ oldLogFileDetail.length + " retain size " + taskRetainSize
+ "bytes.");
newLogFileDetail.length = oldLogFileDetail.length;
}
long charsSkipped =
logFileReader.skip(oldLogFileDetail.length
- newLogFileDetail.length);
if (charsSkipped != oldLogFileDetail.length - newLogFileDetail.length) {
throw new IOException("Erroneously skipped " + charsSkipped
+ " instead of the expected "
+ (oldLogFileDetail.length - newLogFileDetail.length));
}
long alreadyRead = 0;
while (alreadyRead < newLogFileDetail.length) {
char tmpBuf[]; // Temporary buffer to read logs
if (newLogFileDetail.length - alreadyRead >= DEFAULT_BUFFER_SIZE) {
tmpBuf = new char[DEFAULT_BUFFER_SIZE];
} else {
tmpBuf = new char[(int) (newLogFileDetail.length - alreadyRead)];
}
int bytesRead = logFileReader.read(tmpBuf);
if (bytesRead < 0) {
break;
} else {
alreadyRead += bytesRead;
}
tmpFileWriter.write(tmpBuf);
}
// ////// End of truncating log file ///////////////////////
return newLogFileDetail;
}
/**
* Truncation of logs is done. Now sync the index files to reflect the
* truncated sizes.
*
* @param firstAttempt
* @param updatedTaskLogFileDetails
*/
private void updateIndicesAfterLogTruncation(TaskAttemptID firstAttempt,
Map<Task, Map<LogName, LogFileDetail>> updatedTaskLogFileDetails) {
for (Entry<Task, Map<LogName, LogFileDetail>> entry :
updatedTaskLogFileDetails.entrySet()) {
Task task = entry.getKey();
Map<LogName, LogFileDetail> logFileDetails = entry.getValue();
Map<LogName, Long[]> logLengths = new HashMap<LogName, Long[]>();
// set current and previous lengths
for (LogName logName : TaskLog.LOGS_TRACKED_BY_INDEX_FILES) {
logLengths.put(logName, new Long[] { Long.valueOf(0L),
Long.valueOf(0L) });
LogFileDetail lfd = logFileDetails.get(logName);
if (lfd != null) {
// Set previous lengths
logLengths.get(logName)[0] = Long.valueOf(lfd.start);
// Set current lengths
logLengths.get(logName)[1] = Long.valueOf(lfd.start + lfd.length);
}
}
try {
TaskLog.writeToIndexFile(firstAttempt, task.getTaskID(),
task.isTaskCleanupTask(), logLengths);
} catch (IOException ioe) {
LOG.warn("Exception in updateIndicesAfterLogTruncation : "
+ StringUtils.stringifyException(ioe));
LOG.warn("Exception encountered while updating index file of task "
+ task.getTaskID()
+ ". Ignoring and continuing with other tasks.");
}
}
}
/**
*
* @throws IOException
*/
void monitorTaskLogs() throws IOException {
Map<TaskAttemptID, PerJVMInfo> tasksBeingTruncated =
new HashMap<TaskAttemptID, PerJVMInfo>();
// Start monitoring newly added finishedJVMs
synchronized (finishedJVMs) {
tasksBeingTruncated.clear();
tasksBeingTruncated.putAll(finishedJVMs);
finishedJVMs.clear();
}
for (Entry<TaskAttemptID, PerJVMInfo> entry :
tasksBeingTruncated.entrySet()) {
truncateLogs(entry.getKey(), entry.getValue());
}
}
@Override
public void run() {
while (true) {
try {
monitorTaskLogs();
try {
synchronized (finishedJVMs) {
while (finishedJVMs.isEmpty()) {
finishedJVMs.wait();
}
}
} catch (InterruptedException e) {
LOG.warn(getName() + " is interrupted. Returning");
return;
}
} catch (Throwable e) {
LOG.warn(getName()
+ " encountered an exception while monitoring : "
+ StringUtils.stringifyException(e));
LOG.info("Ingoring the exception and continuing monitoring.");
}
}
}
}