Package com.manning.hip.ch12.crunch

Source Code of com.manning.hip.ch12.crunch.PopularLinks

package com.manning.hip.ch12.crunch;

import com.cloudera.crunch.*;
import com.cloudera.crunch.impl.mr.MRPipeline;
import com.cloudera.crunch.lib.Aggregate;
import com.cloudera.crunch.type.PTypeFamily;
import com.manning.hip.common.CommonLogEntry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

/**
* Parse Apache logs, extract resources, group them and
* then order by popularity.
*/
public class PopularLinks {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path output = new Path(args[1]);
    output.getFileSystem(conf).delete(output, true);

    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(PopularLinks.class, conf);

    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Define a function that splits each line in a PCollection of Strings into a
    // PCollection made up of the individual words in the file.
    PCollection<CommonLogEntry> logs = CrunchUtils.logs(lines);

    PCollection<String> resources = extractFilterResources(logs);

    PTable<String, Long> counts = Aggregate.count(resources);

    // Instruct the pipeline to write the resulting counts to a text file.
    pipeline.writeTextFile(counts, args[1]);
    // Execute the pipeline as a MapReduce.
    pipeline.done();
  }

  public static PCollection<String> extractFilterResources(PCollection<CommonLogEntry> logs) {
    PTypeFamily tf = logs.getTypeFamily();
    return logs.parallelDo(
        "resource-extract-filter",
        new DoFn<CommonLogEntry, String>() {
          @Override
          public void process(CommonLogEntry input, Emitter<String> emitter) {
            if (!"127.0.0.1".equals(input.getRemoteAddress())) {
              emitter.emit(input.getResource());
            }
          }
        }, tf.strings());
  }
}
TOP

Related Classes of com.manning.hip.ch12.crunch.PopularLinks

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.