package com.manning.hip.ch12.crunch;

import com.cloudera.crunch.*;
import com.cloudera.crunch.impl.mr.MRPipeline;
import com.cloudera.crunch.lib.Join;
import com.cloudera.crunch.type.PTypeFamily;
import com.manning.hip.common.CommonLogEntry;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import java.io.IOException;

/**
 * Crunch pipeline that parses Apache web server logs and joins them with a
 * separate users file, keyed on the client IP address.
 */
public class JoinLogsAndUsers {
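
  // Hypothetical input samples (the exact formats come from the book's test
  // data, which isn't shown here):
  //
  //   logs file:  Apache Common Log Format lines, e.g.
  //     89.151.85.133 - - [10/Oct/2000:13:55:36 -0700] "GET /a HTTP/1.0" 200 2326
  //   users file: one "<ip> <username>" pair per line, e.g.
  //     89.151.85.133 bob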
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();

    // Remove any output left over from a previous run.
    Path output = new Path(args[2]);
    output.getFileSystem(conf).delete(output, true);

    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(JoinLogsAndUsers.class, conf);

    // Reference the Apache log file as a collection of lines.
    PCollection<String> rawLogs = pipeline.readTextFile(args[0]);

    // Reference the users file as a collection of lines.
    PCollection<String> rawUsers = pipeline.readTextFile(args[1]);

    // Parse the log lines into CommonLogEntry records keyed by client IP,
    // and key the users lines by IP, so both tables share the join key.
    PTable<String, CommonLogEntry> logs = logsAsIpTable(CrunchUtils.logs(rawLogs));
    PTable<String, String> ipsAndUsers = ipsAndUsers(rawUsers);

    // Inner join the two tables on the IP address.
    PTable<String, Pair<String, CommonLogEntry>> joined = Join.join(ipsAndUsers, logs);

    // Persist the joined records to the output directory cleared above, then
    // materialize them locally (which triggers the pipeline run) and print
    // each match.
    pipeline.writeTextFile(joined, args[2]);
    for (Pair<String, Pair<String, CommonLogEntry>> j : joined.materialize()) {
      System.out.println(j.first() + " " + j.second().first());
    }

    // Signal completion so the pipeline can clean up its temporary files.
    pipeline.done();
  }
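
  // For a hypothetical users line "89.151.85.133 bob" whose IP also appears
  // in the logs, the loop above prints "89.151.85.133 bob": the join key,
  // then the username half of the joined pair. The matching CommonLogEntry
  // is available as j.second().second().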

  /**
   * Converts parsed log entries into a table keyed by the client IP address.
   */
  public static PTable<String, CommonLogEntry> logsAsIpTable(
      PCollection<CommonLogEntry> logs) {
    PTypeFamily tf = logs.getTypeFamily();
    return logs.parallelDo(
        "logs-to-ip-table",
        new DoFn<CommonLogEntry, Pair<String, CommonLogEntry>>() {
          @Override
          public void process(CommonLogEntry input,
              Emitter<Pair<String, CommonLogEntry>> emitter) {
            // Key each log entry by the remote (client) IP address.
            emitter.emit(Pair.of(input.getRemoteAddress(), input));
          }
        }, tf.tableOf(tf.strings(), tf.records(CommonLogEntry.class)));
  }
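
  // Note: the tableOf(strings(), records(...)) argument is the PType that
  // tells Crunch how to (de)serialize the key/value pairs between MapReduce
  // stages. An assumption here: records() defers to the active type family,
  // so CommonLogEntry must be serializable by it (e.g. a Writable for the
  // writable family, or reflection-serializable for the Avro family).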

  /**
   * Parses the users file, where each line maps an IP address to a username,
   * into a table keyed by the IP address.
   */
  public static PTable<String, String> ipsAndUsers(PCollection<String> ipUsers) {
    PTypeFamily tf = ipUsers.getTypeFamily();
    return ipUsers.parallelDo(
        "extract-users",
        new DoFn<String, Pair<String, String>>() {
          @Override
          public void process(String input, Emitter<Pair<String, String>> emitter) {
            // The first whitespace-delimited token is the IP address and the
            // second is the username; skip lines that lack both tokens.
            String[] parts = StringUtils.split(input);
            if (parts != null && parts.length >= 2) {
              emitter.emit(Pair.of(parts[0], parts[1]));
            }
          }
        }, tf.tableOf(tf.strings(), tf.strings()));
  }
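
  // A minimal local-test sketch (assumptions: this Crunch version ships
  // com.cloudera.crunch.impl.mem.MemPipeline with typedCollectionOf(), and
  // com.cloudera.crunch.type.writable.Writables provides strings()). It
  // exercises ipsAndUsers() in memory, without a MapReduce cluster:
  //
  //   PCollection<String> users = MemPipeline.typedCollectionOf(
  //       Writables.strings(),
  //       "10.0.0.1 alice",  // hypothetical sample lines
  //       "10.0.0.2 bob");
  //   for (Pair<String, String> p : ipsAndUsers(users).materialize()) {
  //     System.out.println(p.first() + " -> " + p.second());
  //   }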
}