Package com.manning.hip.ch12.crunch

Source Code of com.manning.hip.ch12.crunch.CrunchUtils

package com.manning.hip.ch12.crunch;

import com.cloudera.crunch.*;
import com.cloudera.crunch.type.PTypeFamily;
import com.manning.hip.common.ApacheCommonLogReader;
import com.manning.hip.common.CommonLogEntry;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;

public class CrunchUtils {

  /**
   * Returns the largest numerical element from the input collection.
   */
  public static <K> PTable<K, String> uniqueValues(PTable<K, String> collect) {
    return collect.groupByKey().combineValues(new CombineFn<K, String>() {

      @Override
      public void process(Pair<K, Iterable<String>> input,
                          Emitter<Pair<K, String>> emitter) {

        Set<String> filenames = new HashSet<String>();

        for (String filename : input.second()) {
          filenames.add(filename);
        }

        Pair<K, String> pair =
            Pair.of(input.first(), StringUtils.join(filenames, ","));
        emitter.emit(pair);
      }
    });
  }

  public static PTable<String, String> extractWordFileTable(PCollection<String> lines) {
    PTypeFamily tf = lines.getTypeFamily();
    return lines.parallelDo(
        "inverted-index",
        new DoFn<String, Pair<String, String>>() {
          String filename;

          @Override
          public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
            super.setContext(context);
            filename = ((FileSplit)
                ((MapContext) context).getInputSplit()).getPath().getName();
          }

          @Override
          public void process(String line,
                              Emitter<Pair<String, String>> emitter) {
            for (String word : StringUtils.split(line)) {
              Pair<String, String> pair =
                  Pair.of(word.toLowerCase(), filename);
              emitter.emit(pair);
            }
          }
        }, tf.tableOf(tf.strings(), tf.strings()));
  }

  public static enum LogCounters {
    LOG_LINE_ERRORS
  }

  public static PCollection<CommonLogEntry> logs(PCollection<String> lines) {
    PTypeFamily tf = lines.getTypeFamily();
    return lines
        .parallelDo(new DoFn<String, CommonLogEntry>() {
          transient ApacheCommonLogReader logReader;
          transient Logger log;

          @Override
          public void initialize() {
            logReader = new ApacheCommonLogReader();
            log = LoggerFactory.getLogger(CrunchUtils.class);
          }

          @Override
          public void process(String input, Emitter<CommonLogEntry> emitter) {
            try {
              CommonLogEntry log = logReader.decodeLine(input);
              if(log != null) {
                emitter.emit(log);
              } else {
                processingError(input, null);
              }
            } catch (IOException e) {
              processingError(input, e);
            }
          }

          void processingError(String line, @Nullable Throwable t) {
            super.getCounter(LogCounters.LOG_LINE_ERRORS).increment(1);
            log.error("Hit exception parsing line '" + line + "'", t);
          }
        }, tf.records(CommonLogEntry.class));
  }

}
TOP

Related Classes of com.manning.hip.ch12.crunch.CrunchUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.