package com.linkedin.camus.etl.kafka.mapred;
import com.linkedin.camus.coders.CamusWrapper;
import com.linkedin.camus.coders.MessageDecoder;
import com.linkedin.camus.etl.kafka.CamusJob;
import com.linkedin.camus.etl.kafka.coders.KafkaAvroMessageDecoder;
import com.linkedin.camus.etl.kafka.coders.MessageDecoderFactory;
import com.linkedin.camus.etl.kafka.common.EtlKey;
import com.linkedin.camus.etl.kafka.common.EtlRequest;
import com.linkedin.camus.etl.kafka.common.LeaderInfo;
import com.linkedin.camus.workallocater.CamusRequest;
import com.linkedin.camus.workallocater.WorkAllocator;
import java.io.IOException;
import java.net.URI;
import java.security.InvalidParameterException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import kafka.api.PartitionOffsetRequestInfo;
import kafka.common.ErrorMapping;
import kafka.common.TopicAndPartition;
import kafka.javaapi.OffsetRequest;
import kafka.javaapi.OffsetResponse;
import kafka.javaapi.PartitionMetadata;
import kafka.javaapi.TopicMetadata;
import kafka.javaapi.TopicMetadataRequest;
import kafka.javaapi.consumer.SimpleConsumer;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;
/**
* Input format for a Kafka pull job.
*/
public class EtlInputFormat extends InputFormat<EtlKey, CamusWrapper> {
// ---- Configuration keys. Each key is read/written through the matching static
// ---- getter/setter pair declared further down in this class.

// Topic patterns to exclude; getSplits discards every topic matching one of them.
public static final String KAFKA_BLACKLIST_TOPIC = "kafka.blacklist.topics";
// Topic patterns to include; when non-empty, getSplits keeps only matching topics.
public static final String KAFKA_WHITELIST_TOPIC = "kafka.whitelist.topics";
// Topics (or the literal "all") whose starting offset getSplits moves to the latest offset.
public static final String KAFKA_MOVE_TO_LAST_OFFSET_LIST = "kafka.move.to.last.offset.list";
// Read by getKafkaClientBufferSize; defaults to 2 * 1024 * 1024 when unset.
public static final String KAFKA_CLIENT_BUFFER_SIZE = "kafka.client.buffer.size";
// Read by getKafkaClientTimeout; defaults to 60000 when unset.
public static final String KAFKA_CLIENT_SO_TIMEOUT = "kafka.client.so.timeout";
// Read by getKafkaMaxPullHrs; defaults to -1 when unset.
public static final String KAFKA_MAX_PULL_HRS = "kafka.max.pull.hrs";
// Read by getKafkaMaxPullMinutesPerTask; defaults to -1 when unset.
public static final String KAFKA_MAX_PULL_MINUTES_PER_TASK = "kafka.max.pull.minutes.per.task";
// Read by getKafkaMaxHistoricalDays; defaults to -1 when unset.
public static final String KAFKA_MAX_HISTORICAL_DAYS = "kafka.max.historical.days";
// Read by getMessageDecoderClass; defaults to KafkaAvroMessageDecoder when unset.
public static final String CAMUS_MESSAGE_DECODER_CLASS = "camus.message.decoder.class";
// Read by getEtlIgnoreSchemaErrors; defaults to false when unset.
public static final String ETL_IGNORE_SCHEMA_ERRORS = "etl.ignore.schema.errors";
// Read by getEtlAuditIgnoreServiceTopicList; defaults to a single empty string when unset.
public static final String ETL_AUDIT_IGNORE_SERVICE_TOPIC_LIST = "etl.audit.ignore.service.topic.list";
// Names the WorkAllocator implementation instantiated by getWorkAllocator.
public static final String CAMUS_WORK_ALLOCATOR_CLASS = "camus.work.allocator.class";
// Fully-qualified class name used by getWorkAllocator when no allocator is configured.
public static final String CAMUS_WORK_ALLOCATOR_DEFAULT = "com.linkedin.camus.workallocater.BaseAllocator";
// Shared logger; assigned by the constructor on first instantiation unless
// setLogger has already supplied one.
private static Logger log = null;
/**
 * Creates the input format. If no shared logger has been installed yet
 * (see {@link #setLogger(Logger)}), one named after the runtime class is created.
 */
public EtlInputFormat() {
    if (log != null) {
        return;
    }
    log = Logger.getLogger(getClass());
}
/**
 * Replaces the logger shared by every instance of this class.
 *
 * @param log the logger to use from now on
 */
public static void setLogger(Logger log) {
    EtlInputFormat.log = log;
}
/**
 * Builds the record reader for one split.
 *
 * @param split   the split to read
 * @param context the task attempt the reader runs in
 * @return a new {@code EtlRecordReader} for the split
 */
@Override
public RecordReader<EtlKey, CamusWrapper> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    RecordReader<EtlKey, CamusWrapper> reader = new EtlRecordReader(split, context);
    return reader;
}
/**
 * Gets the topic metadata from Kafka.
 *
 * <p>The configured brokers are tried in random order until one answers the
 * metadata request. The request's topic list is never populated here;
 * {@code getSplits} treats the result as the metadata of all topics.
 *
 * @param context the job whose configuration names the Kafka brokers
 * @return the topic metadata returned by the first broker that responded
 * @throws InvalidParameterException if no broker is configured, or a broker
 *         entry is not of the form {@code address:port}
 * @throws RuntimeException if every broker failed; the cause is the last failure
 */
public List<TopicMetadata> getKafkaMetadata(JobContext context) {
    ArrayList<String> metaRequestTopics = new ArrayList<String>();
    CamusJob.startTiming("kafkaSetupTime");

    String brokerString = CamusJob.getKafkaBrokers(context);
    // A missing setting must produce the same descriptive error as an empty one,
    // not a bare NullPointerException.
    if (brokerString == null || brokerString.isEmpty()) {
        throw new InvalidParameterException("kafka.brokers must contain at least one node");
    }

    // Shuffle so that repeated runs do not always hit the same broker first.
    List<String> brokers = Arrays.asList(brokerString.split("\\s*,\\s*"));
    Collections.shuffle(brokers);

    List<TopicMetadata> topicMetadataList = null;
    Exception savedException = null;
    boolean fetchMetaDataSucceeded = false;

    for (String broker : brokers) {
        SimpleConsumer consumer = createConsumer(context, broker);
        log.info(String.format("Fetching metadata from broker %s with client id %s for %d topic(s) %s",
                broker, consumer.clientId(), metaRequestTopics.size(), metaRequestTopics));
        try {
            topicMetadataList = consumer.send(new TopicMetadataRequest(metaRequestTopics)).topicsMetadata();
            fetchMetaDataSucceeded = true;
        } catch (Exception e) {
            // Remember the failure and fall through to the next broker.
            savedException = e;
            log.warn(String.format("Fetching topic metadata with client id %s for topics [%s] from broker [%s] failed",
                    consumer.clientId(), metaRequestTopics, broker), e);
        } finally {
            consumer.close();
        }
        if (fetchMetaDataSucceeded) {
            break;
        }
    }

    if (!fetchMetaDataSucceeded) {
        throw new RuntimeException("Failed to obtain metadata!", savedException);
    }
    CamusJob.stopTiming("kafkaSetupTime");
    return topicMetadataList;
}
/**
 * Opens a {@code SimpleConsumer} against one broker.
 *
 * <p>The port is everything after the <em>last</em> colon, so an address that
 * itself contains colons is not mis-split into the wrong host and port.
 *
 * @param context the job supplying timeout, buffer size and client name
 * @param broker  broker address in {@code address:port} form
 * @return a consumer connected to the given broker; the caller must close it
 * @throws InvalidParameterException if {@code broker} does not end in {@code :<digits>}
 */
private SimpleConsumer createConsumer(JobContext context, String broker) {
    if (!broker.matches(".+:\\d+")) {
        throw new InvalidParameterException("The kafka broker " + broker + " must follow address:port pattern");
    }
    // The pattern above guarantees a colon followed only by digits at the end.
    int portSeparator = broker.lastIndexOf(':');
    String host = broker.substring(0, portSeparator);
    int port = Integer.parseInt(broker.substring(portSeparator + 1));
    return new SimpleConsumer(
            host,
            port,
            CamusJob.getKafkaTimeoutValue(context),
            CamusJob.getKafkaBufferSize(context),
            CamusJob.getKafkaClientName(context));
}
/**
 * Gets the latest and earliest offsets of every partition and creates one
 * request per partition.
 *
 * <p>For each leader, two offset requests (latest and earliest) covering all
 * of that leader's partitions are sent over a single consumer, which is closed
 * even when a request fails.
 *
 * @param context           the job supplying the Kafka client settings
 * @param offsetRequestInfo the partitions to query, grouped by their leader
 * @return one {@code EtlRequest} per partition with its latest and earliest offset set
 */
public ArrayList<CamusRequest> fetchLatestOffsetAndCreateEtlRequests(
        JobContext context,
        HashMap<LeaderInfo, ArrayList<TopicAndPartition>> offsetRequestInfo) {
    ArrayList<CamusRequest> finalRequests = new ArrayList<CamusRequest>();

    // Each request asks for a single offset per partition; the same descriptor
    // object is shared by every partition.
    PartitionOffsetRequestInfo partitionLatestOffsetRequestInfo = new PartitionOffsetRequestInfo(
            kafka.api.OffsetRequest.LatestTime(), 1);
    PartitionOffsetRequestInfo partitionEarliestOffsetRequestInfo = new PartitionOffsetRequestInfo(
            kafka.api.OffsetRequest.EarliestTime(), 1);

    for (Map.Entry<LeaderInfo, ArrayList<TopicAndPartition>> entry : offsetRequestInfo.entrySet()) {
        LeaderInfo leader = entry.getKey();
        ArrayList<TopicAndPartition> topicAndPartitions = entry.getValue();

        Map<TopicAndPartition, PartitionOffsetRequestInfo> latestOffsetInfo =
                new HashMap<TopicAndPartition, PartitionOffsetRequestInfo>();
        Map<TopicAndPartition, PartitionOffsetRequestInfo> earliestOffsetInfo =
                new HashMap<TopicAndPartition, PartitionOffsetRequestInfo>();
        for (TopicAndPartition topicAndPartition : topicAndPartitions) {
            latestOffsetInfo.put(topicAndPartition, partitionLatestOffsetRequestInfo);
            earliestOffsetInfo.put(topicAndPartition, partitionEarliestOffsetRequestInfo);
        }

        OffsetResponse latestOffsetResponse;
        OffsetResponse earliestOffsetResponse;
        SimpleConsumer consumer = new SimpleConsumer(
                leader.getUri().getHost(),
                leader.getUri().getPort(),
                CamusJob.getKafkaTimeoutValue(context),
                CamusJob.getKafkaBufferSize(context),
                CamusJob.getKafkaClientName(context));
        try {
            latestOffsetResponse = consumer.getOffsetsBefore(new OffsetRequest(
                    latestOffsetInfo,
                    kafka.api.OffsetRequest.CurrentVersion(),
                    CamusJob.getKafkaClientName(context)));
            earliestOffsetResponse = consumer.getOffsetsBefore(new OffsetRequest(
                    earliestOffsetInfo,
                    kafka.api.OffsetRequest.CurrentVersion(),
                    CamusJob.getKafkaClientName(context)));
        } finally {
            // Release the connection even if one of the requests throws.
            consumer.close();
        }

        for (TopicAndPartition topicAndPartition : topicAndPartitions) {
            // NOTE(review): indexing [0] assumes the broker returned an offset for
            // every requested partition - confirm how error responses should be handled.
            long latestOffset = latestOffsetResponse.offsets(
                    topicAndPartition.topic(), topicAndPartition.partition())[0];
            long earliestOffset = earliestOffsetResponse.offsets(
                    topicAndPartition.topic(), topicAndPartition.partition())[0];

            //TODO: factor out kafka specific request functionality
            CamusRequest etlRequest = new EtlRequest(
                    context,
                    topicAndPartition.topic(),
                    Integer.toString(leader.getLeaderId()),
                    topicAndPartition.partition(),
                    leader.getUri());
            etlRequest.setLatestOffset(latestOffset);
            etlRequest.setEarliestOffset(earliestOffset);
            finalRequests.add(etlRequest);
        }
    }
    return finalRequests;
}
/**
 * Combines a set of topic patterns into one regular expression of the form
 * {@code (p1|p2|...)}.
 *
 * @param topicsSet the patterns to combine; must contain at least one entry
 * @return the combined regular expression
 * @throws IllegalArgumentException if {@code topicsSet} is null or empty
 * @throws java.util.regex.PatternSyntaxException if the combined expression is not a valid regex
 */
public String createTopicRegEx(HashSet<String> topicsSet) {
    if (topicsSet == null || topicsSet.isEmpty()) {
        throw new IllegalArgumentException("topicsSet must contain at least one topic pattern");
    }
    StringBuilder alternation = new StringBuilder("(");
    String separator = "";
    for (String topicPattern : topicsSet) {
        alternation.append(separator).append(topicPattern);
        separator = "|";
    }
    String regex = alternation.append(')').toString();
    // Compiled only to fail fast on an invalid expression; the result is discarded.
    Pattern.compile(regex);
    return regex;
}
/**
 * Keeps only the topics whose name fully matches one of the whitelist patterns.
 * Every rejected topic is logged at info level.
 *
 * @param topicMetadataList the candidate topics
 * @param whiteListTopics   the whitelist patterns, combined via {@code createTopicRegEx}
 * @return a new list holding the matching topics in their original order
 */
public List<TopicMetadata> filterWhitelistTopics(
        List<TopicMetadata> topicMetadataList,
        HashSet<String> whiteListTopics) {
    ArrayList<TopicMetadata> filteredTopics = new ArrayList<TopicMetadata>();
    // Compile once instead of once per topic.
    Pattern whitelist = Pattern.compile(createTopicRegEx(whiteListTopics));
    for (TopicMetadata topicMetadata : topicMetadataList) {
        if (whitelist.matcher(topicMetadata.topic()).matches()) {
            filteredTopics.add(topicMetadata);
        } else {
            log.info("Discarding topic : " + topicMetadata.topic());
        }
    }
    return filteredTopics;
}
/**
 * Plans the work of the job.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>fetch topic metadata and apply the whitelist and blacklist;</li>
 * <li>group the usable partitions by leader and fetch their latest/earliest offsets;</li>
 * <li>write the resulting requests to the output directory;</li>
 * <li>merge in the offsets recorded by previous runs, honouring the
 *     "move to latest" topic list, and reset any offset that lies outside
 *     the range currently available on the broker to the earliest offset;</li>
 * <li>write the starting offsets and let the configured work allocator build the splits.</li>
 * </ol>
 *
 * @param context the job being planned
 * @return the splits produced by the configured {@code WorkAllocator}
 * @throws IOException if the Kafka metadata cannot be obtained or processed
 *         (the original failure is attached as the cause), or if reading or
 *         writing the offset files fails
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    CamusJob.startTiming("getSplits");
    ArrayList<CamusRequest> finalRequests;
    HashMap<LeaderInfo, ArrayList<TopicAndPartition>> offsetRequestInfo =
            new HashMap<LeaderInfo, ArrayList<TopicAndPartition>>();
    try {
        // Get Metadata for all topics
        List<TopicMetadata> topicMetadataList = getKafkaMetadata(context);

        // Filter any white list topics
        HashSet<String> whiteListTopics = new HashSet<String>(
                Arrays.asList(getKafkaWhitelistTopic(context)));
        if (!whiteListTopics.isEmpty()) {
            topicMetadataList = filterWhitelistTopics(topicMetadataList, whiteListTopics);
        }

        // Filter all blacklist topics. With no blacklist the pattern is the empty
        // regex, which only matches an empty topic name.
        HashSet<String> blackListTopics = new HashSet<String>(
                Arrays.asList(getKafkaBlacklistTopic(context)));
        String regex = "";
        if (!blackListTopics.isEmpty()) {
            regex = createTopicRegEx(blackListTopics);
        }
        Pattern blacklist = Pattern.compile(regex);

        for (TopicMetadata topicMetadata : topicMetadataList) {
            String topic = topicMetadata.topic();
            if (blacklist.matcher(topic).matches()) {
                log.info("Discarding topic (blacklisted): " + topic);
            } else if (!createMessageDecoder(context, topic)) {
                log.info("Discarding topic (Decoder generation failed) : " + topic);
            } else if (topicMetadata.errorCode() != ErrorMapping.NoError()) {
                log.info("Skipping the creation of ETL request for Whole Topic : "
                        + topic
                        + " Exception : "
                        + ErrorMapping.exceptionFor(topicMetadata.errorCode()));
            } else {
                for (PartitionMetadata partitionMetadata : topicMetadata.partitionsMetadata()) {
                    // We only care about LeaderNotAvailableCode error on partitionMetadata level
                    // Error codes such as ReplicaNotAvailableCode should not stop us.
                    if (partitionMetadata.errorCode() == ErrorMapping.LeaderNotAvailableCode()) {
                        log.info("Skipping the creation of ETL request for Topic : "
                                + topic
                                + " and Partition : "
                                + partitionMetadata.partitionId()
                                + " Exception : "
                                + ErrorMapping.exceptionFor(partitionMetadata.errorCode()));
                        continue;
                    }
                    if (partitionMetadata.errorCode() != ErrorMapping.NoError()) {
                        log.warn("Receiving non-fatal error code, Continuing the creation of ETL request for Topic : "
                                + topic
                                + " and Partition : "
                                + partitionMetadata.partitionId()
                                + " Exception : "
                                + ErrorMapping.exceptionFor(partitionMetadata.errorCode()));
                    }
                    LeaderInfo leader = new LeaderInfo(
                            new URI("tcp://" + partitionMetadata.leader().getConnectionString()),
                            partitionMetadata.leader().id());
                    // Group partitions by leader so each leader is queried only once.
                    ArrayList<TopicAndPartition> topicAndPartitions = offsetRequestInfo.get(leader);
                    if (topicAndPartitions == null) {
                        topicAndPartitions = new ArrayList<TopicAndPartition>();
                        offsetRequestInfo.put(leader, topicAndPartitions);
                    }
                    topicAndPartitions.add(new TopicAndPartition(topic, partitionMetadata.partitionId()));
                }
            }
        }
    } catch (Exception e) {
        log.error("Unable to pull requests from Kafka brokers. Exiting the program", e);
        // Fail with the real cause instead of returning null, which callers
        // would only trip over later as a NullPointerException.
        throw new IOException("Unable to pull requests from Kafka brokers", e);
    }

    // Get the latest offsets and generate the EtlRequests
    finalRequests = fetchLatestOffsetAndCreateEtlRequests(context, offsetRequestInfo);

    Collections.sort(finalRequests, new Comparator<CamusRequest>() {
        public int compare(CamusRequest r1, CamusRequest r2) {
            return r1.getTopic().compareTo(r2.getTopic());
        }
    });

    log.info("The requests from kafka metadata are: \n" + finalRequests);
    writeRequests(finalRequests, context);

    Map<CamusRequest, EtlKey> offsetKeys = getPreviousOffsets(
            FileInputFormat.getInputPaths(context), context);
    Set<String> moveLatest = getMoveToLatestTopicsSet(context);

    for (CamusRequest request : finalRequests) {
        if (moveLatest.contains(request.getTopic()) || moveLatest.contains("all")) {
            log.info("Moving to latest for topic: " + request.getTopic());
            //TODO: factor out kafka specific request functionality
            EtlKey oldKey = offsetKeys.get(request);
            EtlKey newKey = new EtlKey(request.getTopic(), ((EtlRequest) request).getLeaderId(),
                    request.getPartition(), 0, request.getLastOffset());
            // Keep the previously observed message size, if there was one.
            if (oldKey != null) {
                newKey.setMessageSize(oldKey.getMessageSize());
            }
            offsetKeys.put(request, newKey);
        }

        EtlKey key = offsetKeys.get(request);
        if (key != null) {
            request.setOffset(key.getOffset());
            request.setAvgMsgSize(key.getMessageSize());
        }

        // An offset outside [earliest, latest] cannot be served; restart from the
        // earliest offset. The second comparison stays short-circuited.
        boolean belowEarliest = request.getEarliestOffset() > request.getOffset();
        if (belowEarliest || request.getOffset() > request.getLastOffset()) {
            if (belowEarliest) {
                log.error("The earliest offset was found to be more than the current offset: " + request);
            } else {
                log.error("The current offset was found to be more than the latest offset: " + request);
            }
            log.error("Moving to the earliest offset available");
            request.setOffset(request.getEarliestOffset());
            //TODO: factor out kafka specific request functionality
            offsetKeys.put(
                    request,
                    new EtlKey(request.getTopic(), ((EtlRequest) request).getLeaderId(),
                            request.getPartition(), 0, request.getOffset()));
        }
        log.info(request);
    }

    writePrevious(offsetKeys.values(), context);

    CamusJob.stopTiming("getSplits");
    CamusJob.startTiming("hadoop");
    CamusJob.setTime("hadoop_start");

    // The allocator receives the whole job configuration as plain properties.
    WorkAllocator allocator = getWorkAllocator(context);
    Properties props = new Properties();
    props.putAll(context.getConfiguration().getValByRegex(".*"));
    allocator.init(props);

    return allocator.allocateWork(finalRequests, context);
}
/**
 * Returns the configured "move to latest" topics as a set.
 *
 * @param context the job whose configuration is read
 * @return a new mutable set of the configured topics; empty when none are configured
 */
private Set<String> getMoveToLatestTopicsSet(JobContext context) {
    String[] configured = getMoveToLatestTopics(context);
    if (configured == null) {
        return new HashSet<String>();
    }
    return new HashSet<String>(Arrays.asList(configured));
}
/**
 * Checks whether a message decoder can be created for a topic. The decoder
 * itself is discarded; a failure is logged, not propagated.
 *
 * @param context the job supplying the decoder configuration
 * @param topic   the topic to create a decoder for
 * @return {@code true} if the decoder was created, {@code false} if creation threw
 */
private boolean createMessageDecoder(JobContext context, String topic) {
    boolean created;
    try {
        MessageDecoderFactory.createMessageDecoder(context, topic);
        created = true;
    } catch (Exception e) {
        log.error("failed to create decoder", e);
        created = false;
    }
    return created;
}
/**
 * Writes the given offset keys to the "previous offsets" sequence file
 * ({@code OFFSET_PREFIX + "-previous"}) in the job's output directory.
 *
 * @param missedKeys the keys to write, each paired with a {@code NullWritable} value
 * @param context    the job supplying the configuration and output path
 * @throws IOException if the directory or file cannot be created or written
 */
private void writePrevious(Collection<EtlKey> missedKeys, JobContext context)
        throws IOException {
    FileSystem fs = FileSystem.get(context.getConfiguration());
    Path output = FileOutputFormat.getOutputPath(context);

    // Create the output directory when it is missing (the check used to be inverted).
    if (!fs.exists(output)) {
        fs.mkdirs(output);
    }

    output = new Path(output, EtlMultiOutputFormat.OFFSET_PREFIX + "-previous");
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
            context.getConfiguration(), output, EtlKey.class,
            NullWritable.class);
    try {
        for (EtlKey key : missedKeys) {
            writer.append(key, NullWritable.get());
        }
    } finally {
        // Close the file even if an append fails.
        writer.close();
    }
}
/**
 * Writes the given requests to the requests sequence file
 * ({@code EtlMultiOutputFormat.REQUESTS_FILE}) in the job's output directory.
 *
 * @param requests the requests to write; each must be an {@code EtlRequest}
 * @param context  the job supplying the configuration and output path
 * @throws IOException if the directory or file cannot be created or written
 */
private void writeRequests(List<CamusRequest> requests, JobContext context)
        throws IOException {
    FileSystem fs = FileSystem.get(context.getConfiguration());
    Path output = FileOutputFormat.getOutputPath(context);

    // Create the output directory when it is missing (the check used to be inverted).
    if (!fs.exists(output)) {
        fs.mkdirs(output);
    }

    output = new Path(output, EtlMultiOutputFormat.REQUESTS_FILE);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
            context.getConfiguration(), output, EtlRequest.class,
            NullWritable.class);
    try {
        for (CamusRequest r : requests) {
            //TODO: factor out kafka specific request functionality
            writer.append((EtlRequest) r, NullWritable.get());
        }
    } finally {
        // Close the file even if an append fails.
        writer.close();
    }
}
/**
 * Reads the offset files (names starting with {@code OFFSET_PREFIX}) found
 * directly under each input path and keeps, per request, the key with the
 * highest offset.
 *
 * @param inputs  the directories to scan for offset files
 * @param context the job supplying the configuration
 * @return the highest recorded key for every topic/leader/partition encountered
 * @throws IOException if listing or reading an offset file fails
 */
private Map<CamusRequest, EtlKey> getPreviousOffsets(Path[] inputs,
        JobContext context) throws IOException {
    Map<CamusRequest, EtlKey> offsetKeysMap = new HashMap<CamusRequest, EtlKey>();
    for (Path input : inputs) {
        FileSystem fs = input.getFileSystem(context.getConfiguration());
        for (FileStatus f : fs.listStatus(input, new OffsetFileFilter())) {
            log.info("previous offset file:" + f.getPath().toString());
            SequenceFile.Reader reader = new SequenceFile.Reader(fs,
                    f.getPath(), context.getConfiguration());
            try {
                EtlKey key = new EtlKey();
                while (reader.next(key, NullWritable.get())) {
                    //TODO: factor out kafka specific request functionality
                    CamusRequest request = new EtlRequest(context,
                            key.getTopic(), key.getLeaderId(),
                            key.getPartition());
                    EtlKey oldKey = offsetKeysMap.get(request);
                    if (oldKey == null || oldKey.getOffset() < key.getOffset()) {
                        offsetKeysMap.put(request, key);
                    }
                    // The key just read may now be held by the map; read the next
                    // record into a fresh instance.
                    key = new EtlKey();
                }
            } finally {
                // Close the file even if reading fails part-way.
                reader.close();
            }
        }
    }
    return offsetKeysMap;
}
/**
 * Registers the work allocator implementation under
 * {@link #CAMUS_WORK_ALLOCATOR_CLASS}.
 *
 * @param job the job whose configuration is updated
 * @param val the allocator class to use
 */
public static void setWorkAllocator(JobContext job, Class<WorkAllocator> val) {
    job.getConfiguration().setClass(
            CAMUS_WORK_ALLOCATOR_CLASS,
            val,
            WorkAllocator.class);
}
/**
 * Instantiates the configured work allocator, falling back to the class named
 * by {@link #CAMUS_WORK_ALLOCATOR_DEFAULT}.
 *
 * @param job the job whose configuration is read
 * @return a new allocator instance
 * @throws RuntimeException wrapping any failure to load or instantiate the class
 */
public static WorkAllocator getWorkAllocator(JobContext job) {
    try {
        Class<?> allocatorClass = job.getConfiguration().getClass(
                CAMUS_WORK_ALLOCATOR_CLASS,
                Class.forName(CAMUS_WORK_ALLOCATOR_DEFAULT));
        Object allocator = allocatorClass.newInstance();
        return (WorkAllocator) allocator;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
/**
 * Stores the "move to latest" topic list under
 * {@link #KAFKA_MOVE_TO_LAST_OFFSET_LIST}.
 *
 * @param job the job whose configuration is updated
 * @param val the raw value to store
 */
public static void setMoveToLatestTopics(JobContext job, String val) {
    job.getConfiguration().set(
            KAFKA_MOVE_TO_LAST_OFFSET_LIST,
            val);
}
/**
 * Reads the "move to latest" topic list from
 * {@link #KAFKA_MOVE_TO_LAST_OFFSET_LIST}.
 *
 * @param job the job whose configuration is read
 * @return whatever {@code getStrings} yields for the key; callers in this class
 *         treat {@code null} as "nothing configured"
 */
public static String[] getMoveToLatestTopics(JobContext job) {
    String[] topics = job.getConfiguration().getStrings(KAFKA_MOVE_TO_LAST_OFFSET_LIST);
    return topics;
}
/**
 * Stores the Kafka client buffer size under {@link #KAFKA_CLIENT_BUFFER_SIZE}.
 *
 * @param job the job whose configuration is updated
 * @param val the buffer size to store
 */
public static void setKafkaClientBufferSize(JobContext job, int val) {
    job.getConfiguration().setInt(
            KAFKA_CLIENT_BUFFER_SIZE,
            val);
}
/**
 * Reads the Kafka client buffer size from {@link #KAFKA_CLIENT_BUFFER_SIZE}.
 *
 * @param job the job whose configuration is read
 * @return the configured value, or {@code 2 * 1024 * 1024} when unset
 */
public static int getKafkaClientBufferSize(JobContext job) {
    final int fallback = 2 * 1024 * 1024;
    return job.getConfiguration().getInt(KAFKA_CLIENT_BUFFER_SIZE, fallback);
}
/**
 * Stores the Kafka client socket timeout under {@link #KAFKA_CLIENT_SO_TIMEOUT}.
 *
 * @param job the job whose configuration is updated
 * @param val the timeout to store
 */
public static void setKafkaClientTimeout(JobContext job, int val) {
    job.getConfiguration().setInt(
            KAFKA_CLIENT_SO_TIMEOUT,
            val);
}
/**
 * Reads the Kafka client socket timeout from {@link #KAFKA_CLIENT_SO_TIMEOUT}.
 *
 * @param job the job whose configuration is read
 * @return the configured value, or {@code 60000} when unset
 */
public static int getKafkaClientTimeout(JobContext job) {
    final int fallback = 60000;
    return job.getConfiguration().getInt(KAFKA_CLIENT_SO_TIMEOUT, fallback);
}
/**
 * Stores the maximum pull hours under {@link #KAFKA_MAX_PULL_HRS}.
 *
 * @param job the job whose configuration is updated
 * @param val the value to store
 */
public static void setKafkaMaxPullHrs(JobContext job, int val) {
    job.getConfiguration().setInt(
            KAFKA_MAX_PULL_HRS,
            val);
}
/**
 * Reads the maximum pull hours from {@link #KAFKA_MAX_PULL_HRS}.
 *
 * @param job the job whose configuration is read
 * @return the configured value, or {@code -1} when unset
 */
public static int getKafkaMaxPullHrs(JobContext job) {
    final int fallback = -1;
    return job.getConfiguration().getInt(KAFKA_MAX_PULL_HRS, fallback);
}
/**
 * Stores the maximum pull minutes per task under
 * {@link #KAFKA_MAX_PULL_MINUTES_PER_TASK}.
 *
 * @param job the job whose configuration is updated
 * @param val the value to store
 */
public static void setKafkaMaxPullMinutesPerTask(JobContext job, int val) {
    job.getConfiguration().setInt(
            KAFKA_MAX_PULL_MINUTES_PER_TASK,
            val);
}
/**
 * Reads the maximum pull minutes per task from
 * {@link #KAFKA_MAX_PULL_MINUTES_PER_TASK}.
 *
 * @param job the job whose configuration is read
 * @return the configured value, or {@code -1} when unset
 */
public static int getKafkaMaxPullMinutesPerTask(JobContext job) {
    final int fallback = -1;
    return job.getConfiguration().getInt(KAFKA_MAX_PULL_MINUTES_PER_TASK, fallback);
}
/**
 * Stores the maximum historical days under {@link #KAFKA_MAX_HISTORICAL_DAYS}.
 *
 * @param job the job whose configuration is updated
 * @param val the value to store
 */
public static void setKafkaMaxHistoricalDays(JobContext job, int val) {
    job.getConfiguration().setInt(
            KAFKA_MAX_HISTORICAL_DAYS,
            val);
}
/**
 * Reads the maximum historical days from {@link #KAFKA_MAX_HISTORICAL_DAYS}.
 *
 * @param job the job whose configuration is read
 * @return the configured value, or {@code -1} when unset
 */
public static int getKafkaMaxHistoricalDays(JobContext job) {
    final int fallback = -1;
    return job.getConfiguration().getInt(KAFKA_MAX_HISTORICAL_DAYS, fallback);
}
/**
 * Stores the topic blacklist under {@link #KAFKA_BLACKLIST_TOPIC}.
 *
 * @param job the job whose configuration is updated
 * @param val the raw blacklist value to store
 */
public static void setKafkaBlacklistTopic(JobContext job, String val) {
    job.getConfiguration().set(
            KAFKA_BLACKLIST_TOPIC,
            val);
}
/**
 * Reads the topic blacklist from {@link #KAFKA_BLACKLIST_TOPIC}.
 *
 * @param job the job whose configuration is read
 * @return the configured entries, or an empty array when the key is unset or empty
 */
public static String[] getKafkaBlacklistTopic(JobContext job) {
    String raw = job.getConfiguration().get(KAFKA_BLACKLIST_TOPIC);
    if (raw == null || raw.isEmpty()) {
        return new String[] {};
    }
    return job.getConfiguration().getStrings(KAFKA_BLACKLIST_TOPIC);
}
/**
 * Stores the topic whitelist under {@link #KAFKA_WHITELIST_TOPIC}.
 *
 * @param job the job whose configuration is updated
 * @param val the raw whitelist value to store
 */
public static void setKafkaWhitelistTopic(JobContext job, String val) {
    job.getConfiguration().set(
            KAFKA_WHITELIST_TOPIC,
            val);
}
/**
 * Reads the topic whitelist from {@link #KAFKA_WHITELIST_TOPIC}.
 *
 * @param job the job whose configuration is read
 * @return the configured entries, or an empty array when the key is unset or empty
 */
public static String[] getKafkaWhitelistTopic(JobContext job) {
    String raw = job.getConfiguration().get(KAFKA_WHITELIST_TOPIC);
    if (raw == null || raw.isEmpty()) {
        return new String[] {};
    }
    return job.getConfiguration().getStrings(KAFKA_WHITELIST_TOPIC);
}
/**
 * Stores the "ignore schema errors" flag under {@link #ETL_IGNORE_SCHEMA_ERRORS}.
 *
 * @param job the job whose configuration is updated
 * @param val the flag value to store
 */
public static void setEtlIgnoreSchemaErrors(JobContext job, boolean val) {
    job.getConfiguration().setBoolean(
            ETL_IGNORE_SCHEMA_ERRORS,
            val);
}
/**
 * Reads the "ignore schema errors" flag from {@link #ETL_IGNORE_SCHEMA_ERRORS}.
 *
 * @param job the job whose configuration is read
 * @return the configured value, or {@code false} when unset
 */
public static boolean getEtlIgnoreSchemaErrors(JobContext job) {
    final boolean fallback = false;
    return job.getConfiguration().getBoolean(ETL_IGNORE_SCHEMA_ERRORS, fallback);
}
/**
 * Stores the audit-ignore topic list under
 * {@link #ETL_AUDIT_IGNORE_SERVICE_TOPIC_LIST}.
 *
 * @param job    the job whose configuration is updated
 * @param topics the raw value to store
 */
public static void setEtlAuditIgnoreServiceTopicList(JobContext job, String topics) {
    job.getConfiguration().set(
            ETL_AUDIT_IGNORE_SERVICE_TOPIC_LIST,
            topics);
}
/**
 * Reads the audit-ignore topic list from
 * {@link #ETL_AUDIT_IGNORE_SERVICE_TOPIC_LIST}.
 *
 * @param job the job whose configuration is read
 * @return the configured entries; the default passed to {@code getStrings} is a
 *         single empty string
 */
public static String[] getEtlAuditIgnoreServiceTopicList(JobContext job) {
    final String[] fallback = { "" };
    return job.getConfiguration().getStrings(ETL_AUDIT_IGNORE_SERVICE_TOPIC_LIST, fallback);
}
/**
 * Registers the message decoder implementation under
 * {@link #CAMUS_MESSAGE_DECODER_CLASS}.
 *
 * @param job the job whose configuration is updated
 * @param cls the decoder class to use
 */
public static void setMessageDecoderClass(JobContext job, Class<MessageDecoder> cls) {
    job.getConfiguration().setClass(
            CAMUS_MESSAGE_DECODER_CLASS,
            cls,
            MessageDecoder.class);
}
/**
 * Reads the message decoder class from {@link #CAMUS_MESSAGE_DECODER_CLASS}.
 *
 * @param job the job whose configuration is read
 * @return the configured class, or {@code KafkaAvroMessageDecoder} when unset
 */
public static Class<MessageDecoder> getMessageDecoderClass(JobContext job) {
    Class<?> configured = job.getConfiguration().getClass(
            CAMUS_MESSAGE_DECODER_CLASS,
            KafkaAvroMessageDecoder.class);
    // Unchecked: the configuration API only exposes an untyped Class.
    return (Class<MessageDecoder>) configured;
}
private class OffsetFileFilter implements PathFilter {
@Override
public boolean accept(Path arg0) {
return arg0.getName()
.startsWith(EtlMultiOutputFormat.OFFSET_PREFIX);
}
}
}