Package: com.cloudera.crunch.impl.mr

Examples of com.cloudera.crunch.impl.mr.MRPipeline


public class SortCrunchTest implements Serializable {
 
  @Test
  public void test() throws IOException {
    Pipeline pipeline = new MRPipeline(SortCrunchTest.class);
    PCollection<String> records = pipeline.readTextFile("sort/A");
   
    PCollection<Pair<Integer, Integer>> pairs = records.parallelDo(new DoFn<String, Pair<Integer, Integer>>() {
      @Override
      public void process(String input, Emitter<Pair<Integer, Integer>> emitter) {
        Iterator<String> split = Splitter.on('\t').split(input).iterator();
        String l = split.next();
        String r = split.next();
        emitter.emit(Pair.of(Integer.parseInt(l), Integer.parseInt(r)));
      }
    }, pairs(ints(), ints()));
   
    PCollection<Pair<Integer, Integer>> sorted = Sort.sortPairs(pairs, by(1, ASCENDING), by(2, DESCENDING));
   
    pipeline.writeTextFile(sorted, "output-sorted");
    pipeline.run();
  }
View Full Code Here


 
  private static final int MISSING = 9999;
 
  @Test
  public void test() throws IOException {
    Pipeline pipeline = new MRPipeline(MaxTemperatureCrunchTest.class);
    PCollection<String> records = pipeline.readTextFile("input");
   
    PTable<String, Integer> maxTemps = records
      .parallelDo(toYearTempPairsFn(), tableOf(strings(), ints()))
      .groupByKey()
      .combineValues(CombineFn.<String> MAX_INTS());
   
    pipeline.writeTextFile(maxTemps, "output");
    pipeline.run();
  }
View Full Code Here

public class JoinCrunchTest implements Serializable {
 
  @Test
  public void test() throws IOException {
    Pipeline pipeline = new MRPipeline(JoinCrunchTest.class);
    PCollection<String> a = pipeline.readTextFile("join/A");
    PCollection<String> b = pipeline.readTextFile("join/B");
   
    PTable<String, String> aTable = a.parallelDo(new DoFn<String, Pair<String, String>>() {
    @Override
    public void process(String input, Emitter<Pair<String, String>> emitter) {
      Iterator<String> split = Splitter.on('\t').split(input).iterator();
      emitter.emit(Pair.of(split.next(), split.next()));
    }
  }, tableOf(strings(),strings()));

    PTable<String, String> bTable = b.parallelDo(new DoFn<String, Pair<String, String>>() {
    @Override
    public void process(String input, Emitter<Pair<String, String>> emitter) {
      Iterator<String> split = Splitter.on('\t').split(input).iterator();
      String l = split.next();
      String r = split.next();
      emitter.emit(Pair.of(r, l));
    }
  }, tableOf(strings(),strings()));
   
    PTable<String, Pair<String, String>> join = Join.join(aTable, bTable);
   
    pipeline.writeTextFile(join, "output-joined");
    pipeline.run();
  }
View Full Code Here

public class CogroupCrunchTest implements Serializable {
 
  @Test
  public void test() throws IOException {
    Pipeline pipeline = new MRPipeline(CogroupCrunchTest.class);
    PCollection<String> a = pipeline.readTextFile("join/A");
    PCollection<String> b = pipeline.readTextFile("join/B");
   
    PTable<String, String> aTable = a.parallelDo(new DoFn<String, Pair<String, String>>() {
    @Override
    public void process(String input, Emitter<Pair<String, String>> emitter) {
      Iterator<String> split = Splitter.on('\t').split(input).iterator();
      emitter.emit(Pair.of(split.next(), split.next()));
    }
  }, tableOf(strings(),strings()));

    PTable<String, String> bTable = b.parallelDo(new DoFn<String, Pair<String, String>>() {
    @Override
    public void process(String input, Emitter<Pair<String, String>> emitter) {
      Iterator<String> split = Splitter.on('\t').split(input).iterator();
      String l = split.next();
      String r = split.next();
      emitter.emit(Pair.of(r, l));
    }
  }, tableOf(strings(),strings()));
   
    PTable<String, Pair<Collection<String>, Collection<String>>> cogroup = Cogroup.cogroup(aTable, bTable);
   
    pipeline.writeTextFile(cogroup, "output-cogrouped");
    pipeline.run();
  }
View Full Code Here

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path output = new Path(args[1]);
    output.getFileSystem(conf).delete(output, true);

    Pipeline pipeline = new MRPipeline(SimpleTokenize.class, conf);

    PCollection<String> lines = pipeline.readTextFile(args[0]);

    PCollection<String> words = lines.parallelDo(
        "tokenize",
        new DoFn<String, String>() {
          @Override
          public void process(String line,
                              Emitter<String> emitter) {
            for (String word : StringUtils.split(line)) {
              emitter.emit(word);
            }
          }
        }, Writables.strings()); // Indicates the serialization format

    pipeline.writeTextFile(words, args[1]);

    pipeline.done();
  }
View Full Code Here

    Configuration conf = new Configuration();
    // Delete any previous output so the job can be rerun cleanly.
    Path output = new Path(args[1]);
    output.getFileSystem(conf).delete(output, true);

    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(InvertedIndex.class, conf);

    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Build a (word, file) table mapping each word to the files it appears in
    // (extractWordFileTable is defined in CrunchUtils, outside this view).
    PTable<String, String> wordDocs = CrunchUtils.extractWordFileTable(lines);

    // De-duplicate the values per key to get the inverted index.
    PTable<String, String> result = CrunchUtils.uniqueValues(wordDocs);

    // Instruct the pipeline to write the resulting index to a text file.
    pipeline.writeTextFile(result, args[1]);
    // Execute the pipeline as a MapReduce.
    pipeline.done();
  }
View Full Code Here

    Configuration conf = new Configuration();
    // Delete any previous output so the job can be rerun cleanly.
    Path output = new Path(args[1]);
    output.getFileSystem(conf).delete(output, true);

    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(PopularLinks.class, conf);

    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Parse each raw line into a structured CommonLogEntry
    // (CrunchUtils.logs is defined outside this view).
    PCollection<CommonLogEntry> logs = CrunchUtils.logs(lines);

    // Extract the requested resources of interest from the log entries.
    PCollection<String> resources = extractFilterResources(logs);

    // Count occurrences per resource to find the popular links.
    PTable<String, Long> counts = Aggregate.count(resources);

    // Instruct the pipeline to write the resulting counts to a text file.
    pipeline.writeTextFile(counts, args[1]);
    // Execute the pipeline as a MapReduce.
    pipeline.done();
  }
View Full Code Here

    Configuration conf = new Configuration();
    // Delete any previous output so the job can be rerun cleanly.
    Path output = new Path(args[2]);
    output.getFileSystem(conf).delete(output, true);

    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(JoinLogsAndUsers.class, conf);

    // Reference the raw log file as a collection of Strings.
    PCollection<String> rawLogs = pipeline.readTextFile(args[0]);

    // Reference the users file as a collection of Strings.
    PCollection<String> rawUsers = pipeline.readTextFile(args[1]);

    // Parse the raw log lines and key the resulting entries by IP address
    // (logsAsIpTable and CrunchUtils.logs are defined outside this view;
    // the method continues beyond this excerpt).
    PTable<String, CommonLogEntry> logs = logsAsIpTable(CrunchUtils.logs(rawLogs));
View Full Code Here

TOP

Related Classes of com.cloudera.crunch.impl.mr.MRPipeline

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.