/*
 * Copyright 2012 LinkedIn, Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.linkedin.whiteelephant.parsing;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ExecutionException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import com.linkedin.whiteelephant.mapreduce.lib.input.CombineDocumentFileFormat;
import com.linkedin.whiteelephant.mapreduce.lib.job.StagedOutputJob;
import com.linkedin.whiteelephant.mapreduce.lib.job.StagedOutputJobExecutor;
import com.linkedin.whiteelephant.util.JobStatsProcessing;
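
/**
 * Parses Hadoop job configuration XML files for each configured cluster and writes
 * one Avro {@link JobConf} record per job using map-only staged-output jobs. The
 * set of files to process is derived from {@link JobStatsProcessing#getTasks},
 * honoring the incremental and num.days settings.
 */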
public class ParseJobConfs
{
  private final Logger _log;
  private final Properties _props;
  private final FileSystem _fs;
  private final String _name;
  private final String _confsOutputPathRoot;
  private final String _logsRoot;
  private final String _clusterNames;
  private final int _numDays;
  private final int _numDaysForced;
  private final boolean _incremental;
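
  /**
   * Reads and validates the job properties. The keys cluster.names, confs.output.path,
   * num.days, num.days.forced, incremental, and logs.root are required. A minimal
   * example (values are illustrative only):
   *
   * <pre>
   * cluster.names=clusterA,clusterB
   * logs.root=/hadoop/logs
   * confs.output.path=/data/whiteelephant/confs
   * num.days=30
   * num.days.forced=1
   * incremental=true
   * </pre>
   *
   * @param name logger and job name prefix
   * @param props job properties
   * @throws IOException if the FileSystem cannot be created
   */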
  public ParseJobConfs(String name, Properties props) throws IOException
  {
    _log = Logger.getLogger(name);
    _name = name;
    _props = props;

    Configuration conf = StagedOutputJob.createConfigurationFromProps(_props);

    System.out.println("fs.default.name: " + conf.get("fs.default.name"));

    _fs = FileSystem.get(conf);

    if (_props.get("cluster.names") == null) {
      throw new IllegalArgumentException("cluster.names is not specified.");
    }

    if (_props.get("confs.output.path") == null) {
      throw new IllegalArgumentException("confs.output.path is not specified.");
    }

    if (_props.get("num.days") == null) {
      throw new IllegalArgumentException("num.days is not specified.");
    }

    if (_props.get("num.days.forced") == null) {
      throw new IllegalArgumentException("num.days.forced is not specified.");
    }

    if (_props.get("incremental") == null) {
      throw new IllegalArgumentException("incremental is not specified.");
    }

    if (_props.get("logs.root") == null) {
      throw new IllegalArgumentException("logs.root is not specified.");
    }

    _confsOutputPathRoot = (String)_props.get("confs.output.path");
    _logsRoot = (String)_props.get("logs.root");
    _clusterNames = (String)_props.get("cluster.names");
    _numDays = Integer.parseInt((String)_props.get("num.days"));
    _numDaysForced = Integer.parseInt((String)_props.get("num.days.forced"));
    _incremental = Boolean.parseBoolean((String)_props.get("incremental"));
  }
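
  /**
   * Submits one parsing job per processing task and per cluster. Each job reads
   * configuration XML files via CombineDocumentFileFormat, runs with zero reducers,
   * and writes Avro output keyed by the JobConf schema (the Avro-generated JobConf
   * in this package, not Hadoop's JobConf). All jobs for a cluster complete before
   * the next cluster is processed.
   */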
  public void execute(StagedOutputJobExecutor executor) throws IOException, InterruptedException, ExecutionException
  {
    for (String clusterName : _clusterNames.split(","))
    {
      System.out.println("Processing cluster " + clusterName);

      List<JobStatsProcessing.ProcessingTask> processingTasks = JobStatsProcessing.getTasks(_fs, _logsRoot, clusterName, _confsOutputPathRoot, "xml", _incremental, _numDays, _numDaysForced);

      for (JobStatsProcessing.ProcessingTask task : processingTasks)
      {
        List<String> inputPaths = new ArrayList<String>();
        inputPaths.add(task.inputPathFormat);

        String outputPath = task.outputPath;

        final StagedOutputJob job = StagedOutputJob.createStagedJob(
            _props,
            _name + "-parse-confs-" + task.id,
            inputPaths,
            "/tmp" + outputPath,
            outputPath,
            _log);

        job.getConfiguration().set("jobs.output.path", _confsOutputPathRoot);
        job.getConfiguration().set("logs.cluster.name", clusterName);

        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(NullWritable.class);

        job.setInputFormatClass(CombineDocumentFileFormat.class);
        job.setOutputFormatClass(AvroKeyOutputFormat.class);

        AvroJob.setOutputKeySchema(job, JobConf.SCHEMA$);

        job.setNumReduceTasks(0);

        job.setMapperClass(ParseJobConfs.TheMapper.class);

        executor.submit(job);
      }

      executor.waitForCompletion();
    }
  }
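
  /**
   * Converts a single configuration XML file into a JobConf Avro record. The input
   * key is the file name, from which the job ID is extracted, and the input value
   * holds the raw bytes of the file.
   */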
  public static class TheMapper extends Mapper<Text, BytesWritable, AvroWrapper<JobConf>, NullWritable>
  {
    private final Logger _log = Logger.getLogger(TheMapper.class);
    private static final Pattern jobPattern = Pattern.compile("job_\\d+_\\d+");

    private String _clusterName;
    private DocumentBuilder builder;
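
    /**
     * Caches the cluster name passed in through the job configuration and builds
     * the DocumentBuilder reused for all XML parsing in this task.
     */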
    @Override
    protected void setup(Context context)
    {
      _clusterName = context.getConfiguration().get("logs.cluster.name");

      try {
        builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
      } catch (ParserConfigurationException e) {
        throw new RuntimeException(e);
      }
    }
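
    /**
     * Emits one JobConf record per input file: records the file path and cluster,
     * extracts the job ID (matching job_\d+_\d+) from the file name, and attaches
     * the parsed configuration map. Files without a recognizable job ID abort the
     * task; files whose XML cannot be parsed are skipped.
     */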
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException
    {
      JobConf jobConf = new JobConf();

      String filename = key.toString();
      jobConf.setPath(filename);

      Matcher jobMatcher = jobPattern.matcher(filename);
      if (!jobMatcher.find()) {
        throw new RuntimeException("Expected to find job ID in file name " + filename + ". Aborting.");
      }

      String jobId = jobMatcher.group();
      jobConf.setJobId(jobId);
      jobConf.setCluster(_clusterName);

      Map<CharSequence, CharSequence> conf = getConfigurationMap(value);
      if (conf == null) {
        // Skip files whose XML could not be parsed instead of emitting a record with a null configuration.
        _log.warn("Could not parse configuration for " + filename + ", skipping.");
        return;
      }
      jobConf.setConfiguration(conf);

      context.write(new AvroKey<JobConf>(jobConf), NullWritable.get());
    }
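
    /**
     * Parses the raw bytes of a job configuration XML document and returns its
     * property name/value pairs, or null if the document cannot be parsed.
     */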
    private Map<CharSequence, CharSequence> getConfigurationMap(BytesWritable bytes)
    {
      InputStream stream = new ByteArrayInputStream(bytes.getBytes(), 0, bytes.getLength());

      Document doc = null;
      try {
        doc = builder.parse(stream);
      } catch (SAXException e) {
        _log.error("Failed to parse configuration XML", e);
        return null;
      } catch (IOException e) {
        _log.error("Failed to read configuration XML", e);
        return null;
      }

      NodeList children = doc.getElementsByTagName("configuration");
      Element child = (Element) children.item(0);

      NodeList properties = child.getElementsByTagName("property");

      Map<CharSequence, CharSequence> conf = new HashMap<CharSequence, CharSequence>(properties.getLength());
      for (int i = 0; i < properties.getLength(); i++)
      {
        Element property = (Element) properties.item(i);
        String name = property.getElementsByTagName("name").item(0).getTextContent();
        String value = property.getElementsByTagName("value").item(0).getTextContent();
        conf.put(name, value);
      }

      return conf;
    }
  }
}