Package com.manning.hip.ch12.crunch

Source Code of com.manning.hip.ch12.crunch.JoinLogsAndUsers

package com.manning.hip.ch12.crunch;

import com.cloudera.crunch.*;
import com.cloudera.crunch.impl.mr.MRPipeline;
import com.cloudera.crunch.lib.Join;
import com.cloudera.crunch.type.PTypeFamily;
import com.manning.hip.common.CommonLogEntry;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

/**
* Parse Apache logs and join them with a separate users file.
*/
public class JoinLogsAndUsers {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path output = new Path(args[2]);
    output.getFileSystem(conf).delete(output, true);

    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(JoinLogsAndUsers.class, conf);

    // Reference a given text file as a collection of Strings.
    PCollection<String> rawLogs = pipeline.readTextFile(args[0]);

    // Reference a given text file as a collection of Strings.
    PCollection<String> rawUsers = pipeline.readTextFile(args[1]);

    // Define a function that splits each line in a PCollection of Strings into a
    // PCollection made up of the individual words in the file.
    PTable<String, CommonLogEntry> logs = logsAsIpTable(CrunchUtils.logs(rawLogs));

    PTable<String, String> ipsAndUsers = ipsAndUsers(rawUsers);

    PTable<String, Pair<String, CommonLogEntry>> joined = Join.join(ipsAndUsers, logs);

    for(Pair<String, Pair<String, CommonLogEntry>> j: joined.materialize()) {
      System.out.println(j.first() + " " + j.second().first());
    }
  }

  public static PTable<String, CommonLogEntry> logsAsIpTable(PCollection<CommonLogEntry> logs) {
    PTypeFamily tf = logs.getTypeFamily();
    return logs.parallelDo(
        "logs-to-ip-table",
        new DoFn<CommonLogEntry, Pair<String, CommonLogEntry>>() {
          @Override
          public void process(CommonLogEntry input, Emitter<Pair<String, CommonLogEntry>> emitter) {
            emitter.emit(Pair.of(input.getRemoteAddress(), input));
          }
        }, tf.tableOf(tf.strings(), tf.records(CommonLogEntry.class)));
  }

  public static PTable<String, String> ipsAndUsers(PCollection<String> ipUsers) {
    PTypeFamily tf = ipUsers.getTypeFamily();
    return ipUsers.parallelDo(
        "extract-users",
        new DoFn<String, Pair<String, String>>() {
          @Override
          public void process(String input, Emitter<Pair<String, String>> emitter) {
            // first token is the IP address, and second is the username
            String[] parts = StringUtils.split(input);
            emitter.emit(Pair.of(parts[0], parts[1]));
          }
        }, tf.tableOf(tf.strings(), tf.strings()));
  }
}
TOP

Related Classes of com.manning.hip.ch12.crunch.JoinLogsAndUsers

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.