/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.flume.handlers.debug;
import java.io.IOException;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.cloudera.flume.conf.SourceFactory.SourceBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.core.EventSource;
import com.cloudera.flume.core.Event.Priority;
import com.google.common.base.Preconditions;
* This adds a parser that breaks out parts of a log4j log file and tries to
* re-concatenate split lines.
* TODO (jon) check if this is the hadoop log4j default out of the box.
public class Log4jTextFileSource extends TextFileSource {
public Log4jTextFileSource(String fname) {
// This should parse a log4j line
Pattern l4jPat = Pattern
.compile("^(\\S+?) (\\S+?) (\\d\\d/\\d\\d/\\d\\d \\d\\d:\\d\\d:\\d\\d) (\\S+?) (.*)");
DateFormat dformat = new SimpleDateFormat("yy/MM/dd HH:mm:ss");
String prevLine = null;
* Reads lines until we either reach the end of a file, or a line that looks
* like the beginning of a new entry. If last line prevLine is set to null. If
* it is a new event, we make the prev line contain that value and return the
* completed event.
Event readUntilNextEvent(Matcher m) throws IOException {
try {
// new event!
String host = m.group(1);
// String xx = m.group(2); // TODO (jon) I don't know what this is
Date d;
d = dformat.parse(m.group(3));
String prio = m.group(4);
String body = m.group(5);
// this is really just to differentiate entries at the "same time" at
// second/millisecond resolution"
long nanos = System.nanoTime();
StringBuilder builder = new StringBuilder(body);
while (true) {
String s2 = raf.readLine();
if (s2 == null) {
// reached the last line
prevLine = null;
Event e = new EventImpl(builder.toString().getBytes(), d.getTime(),
Priority.valueOf(prio), nanos, host, new HashMap<String, byte[]>());
return e;
// valid line, need to get more lines.
Matcher m2 = l4jPat.matcher(s2);
if (m2.matches()) {
// a new matching line? event finished, save line for next round.
Event e = new EventImpl(builder.toString().getBytes(), d.getTime(),
Priority.valueOf(prio), nanos, host, new HashMap<String, byte[]>());
prevLine = s2;
return e;
// didn't match, append to previous
} catch (ParseException e1) {
// TODO XXX Auto-generated catch block
return null;
* Because of the semantics of next, we need to keep the previous line around.
public Event next() throws IOException {
Preconditions.checkState(raf != null,
"Need to open source before reading from it");
String s = null;
if (prevLine == null) {
s = raf.readLine();
if (s == null)
return null; // last line
// first line (fall through)
} else {
// event break
s = prevLine;
Matcher m = l4jPat.matcher(s);
if (m.matches()) {
Event e = readUntilNextEvent(m);
return e;
// This is unreachable.
return null;
public static SourceBuilder builder() {
return new SourceBuilder() {
public EventSource build(String... argv) {
if (argv.length != 1) {
throw new IllegalArgumentException("usage: log4jfile(filename) ");
return new Log4jTextFileSource(argv[0]);