Package org.apache.crunch.impl.mr

Examples of org.apache.crunch.impl.mr.MRPipeline
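
All of the excerpts below share the same basic lifecycle: construct an MRPipeline with a class (used to locate the containing jar) and a Hadoop Configuration, read one or more sources into PCollections, transform them with DoFns, write the results to a target, and finish with run() or done(). A minimal sketch of that shape, assuming a Tool subclass and the usual org.apache.crunch imports as in the excerpts below (class and argument names here are placeholders):

  public int run(String[] args) throws Exception {
    // Coordinate pipeline creation and execution; MyTool.class locates the job jar.
    Pipeline pipeline = new MRPipeline(MyTool.class, getConf());
    // Reference a text file as a PCollection of lines.
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    // Split each line into words, then count the occurrences of each word.
    PTable<String, Long> counts = lines.parallelDo(new DoFn<String, String>() {
      @Override
      public void process(String line, Emitter<String> emitter) {
        for (String word : line.split("\\s+")) {
          emitter.emit(word);
        }
      }
    }, Writables.strings()).count();
    // Write the counts out and execute the pipeline as a MapReduce job.
    pipeline.writeTextFile(counts, args[1]);
    return pipeline.done().succeeded() ? 0 : 1;
  }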


  private List<KeyValue> doTestReadHFiles(List<KeyValue> kvs, Scan scan) throws IOException {
    Path inputPath = tmpDir.getPath("in");
    writeKeyValuesToHFile(inputPath, kvs);

    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<KeyValue> results = pipeline.read(FromHBase.hfile(inputPath));
    return ImmutableList.copyOf(results.materialize());
  }


      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Aggregator used for summing up response size and count
    Aggregator<Pair<Long, Long>> agg = pairAggregator(SUM_LONGS(), SUM_LONGS());

    // Table of (ip, sum(response size), count)
    PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines
        .parallelDo(extractResponseSize,
            Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs()))).groupByKey()
        .combineValues(agg);

    // Calculate average response size by ip address
    PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage,
        Writables.tableOf(Writables.strings(), Writables.doubles()));

    // write the result to a text file
    pipeline.writeTextFile(avgs, args[1]);
    // Execute the pipeline as a MapReduce.
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
  }
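
The run method above refers to two functions, extractResponseSize and calulateAverage, that are defined elsewhere in AverageBytesByIP and not shown in the excerpt. A hedged sketch of what the averaging step might look like, keeping the field name exactly as spelled in the excerpt; the real implementation may differ:

  // Hypothetical sketch: divide the summed response size by the request count for each IP.
  MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>> calulateAverage =
      new MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>>() {
        @Override
        public Pair<String, Double> map(Pair<String, Pair<Long, Long>> ipToSumAndCount) {
          Pair<Long, Long> sumAndCount = ipToSumAndCount.second();
          double average = sumAndCount.second() == 0
              ? 0.0
              : sumAndCount.first() / (double) sumAndCount.second();
          return Pair.of(ipToSumAndCount.first(), average);
        }
      };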

      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(TotalWordCount.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Define a function that maps each line in the PCollection of Strings to
    // the number of whitespace-separated words it contains.
    PCollection<Long> numberOfWords = lines.parallelDo(new DoFn<String, Long>() {
      public void process(String line, Emitter<Long> emitter) {
        emitter.emit((long)line.split("\\s+").length);
      }
    }, Writables.longs()); // Indicates the serialization format

    // Aggregate the per-line word counts into a single value, exposed as a PObject.
    PObject<Long> totalCount = numberOfWords.aggregate(Aggregators.SUM_LONGS()).first();

    // Execute the pipeline as a MapReduce.
    PipelineResult result = pipeline.run();

    System.out.println("Total number of words: " + totalCount.getValue());
   
    pipeline.done();

    return result.succeeded() ? 0 : 1;
  }

    createTable(configuration, TABLE_TARGET, Bytes.toString(COLUMN_FAMILY_TARGET));

    putInHbase(putList, configuration);

    // We create the pipeline which will handle most of the job.
    Pipeline pipeline = new MRPipeline(WordAggregationHBase.class, HBaseConfiguration.create());

    // The scan which will retrieve the data from the source in hbase.
    Scan scan = new Scan();
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY);
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE);

    // Our hbase source
    HBaseSourceTarget source = new HBaseSourceTarget(TABLE_SOURCE, scan);

    // Our source, in a format which can be used by Crunch
    PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);

    // We process the data from the source HTable, then concatenate all the data
    // that shares the same row key
    PTable<String, String> textExtracted = extractText(rawText);
    PTable<String, String> result = textExtracted.groupByKey()
        .combineValues(Aggregators.STRING_CONCAT(" ", true));

    // We create the collection of puts from the concatenated data
    PCollection<Put> resultPut = createPut(result);

    // We write the puts in hbase, in the target table
    pipeline.write(resultPut, new HBaseTarget(TABLE_TARGET));

    pipeline.done();
    return 0;
  }
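
The helpers extractText and createPut used above are defined elsewhere in WordAggregationHBase. A sketch of what a createPut implementation could look like, assuming a target column qualifier constant (COLUMN_QUALIFIER_TARGET is a placeholder name, not taken from the excerpt):

  // Hypothetical sketch: turn each (row key, concatenated text) pair into an HBase Put.
  private static PCollection<Put> createPut(PTable<String, String> concatenated) {
    return concatenated.parallelDo("convert to puts", new DoFn<Pair<String, String>, Put>() {
      @Override
      public void process(Pair<String, String> input, Emitter<Put> emitter) {
        Put put = new Put(Bytes.toBytes(input.first()));
        // COLUMN_QUALIFIER_TARGET stands in for whatever qualifier the class defines.
        put.add(COLUMN_FAMILY_TARGET, COLUMN_QUALIFIER_TARGET, Bytes.toBytes(input.second()));
        emitter.emit(put);
      }
    }, Writables.writables(Put.class));
  }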

    jos.closeEntry();
  }

  @Test
  public void testWordCount() throws IOException {
    run(new MRPipeline(WordCountHBaseIT.class, hbaseTestUtil.getConfiguration()));
  }

    pipeline.done();

    //verify HBaseTarget supports deletes.
    Scan clearScan = new Scan();
    clearScan.addFamily(COUNTS_COLFAM);
    pipeline = new MRPipeline(WordCountHBaseIT.class, hbaseTestUtil.getConfiguration());
    HBaseSourceTarget clearSource = new HBaseSourceTarget(outputTableName, clearScan);
    PTable<ImmutableBytesWritable, Result> counts = pipeline.read(clearSource);
    pipeline.write(clearCounts(counts), new HBaseTarget(outputTableName));
    pipeline.done();

      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(SecondarySortExample.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Define a function that parses each line in a PCollection of Strings into
    // a nested pair: the outer first element is the grouping key, and the inner
    // pair holds the sort key (a timestamp) and the payload value, which will
    // later be delivered in an Iterable sorted by that key.
    PTable<String, Pair<Long, String>> pairs = lines.parallelDo("extract_records",
        new DoFn<String, Pair<String, Pair<Long, String>>>() {
          @Override
          public void process(String line, Emitter<Pair<String, Pair<Long, String>>> emitter) {
            int i = 0;
            String key = "";
            long timestamp = 0;
            String value = "";
            for (String element : INPUT_SPLITTER.split(line)) {
              switch (++i) {
              case 1:
                key = element;
                break;
              case 2:
                try {
                  timestamp = Long.parseLong(element);
                } catch (NumberFormatException e) {
                  System.out.println("Timestamp not in long format '" + line + "'");
                  this.increment(COUNTERS.CORRUPT_TIMESTAMP);
                }
                break;
              case 3:
                value = element;
                break;
              default:
                System.err.println("i = " + i + " should never happen!");
                break;
              }
            }
            if (i == 3) {
              Long sortby = Long.valueOf(timestamp);
              emitter.emit(Pair.of(key, Pair.of(sortby, value)));
            } else {
              this.increment(COUNTERS.CORRUPT_LINE);
            }
          }}, Avros.tableOf(Avros.strings(), Avros.pairs(Avros.longs(), Avros.strings())));

    // The output of the above input will be (with one reducer):

    // one : [[-10,garbage],[-5,10],[1,1],[2,-3]]
    // three : [[0,-1]]
    // two : [[1,7,9],[2,6],[4,5]]

    SecondarySort.sortAndApply(pairs,
        new DoFn<Pair<String, Iterable<Pair<Long, String>>>, String>() {
          final StringBuilder sb = new StringBuilder();
          @Override
          public void process(Pair<String, Iterable<Pair<Long, String>>> input, Emitter<String> emitter) {
            sb.setLength(0);
            sb.append(input.first());
            sb.append(" : [");
            boolean first = true;
            for(Pair<Long, String> pair : input.second()) {
              if (first) {
                first = false;
              } else {
                sb.append(',');
              }
              sb.append(pair);
            }
            sb.append("]");
            emitter.emit(sb.toString());
          }
        }, Writables.strings()).write(To.textFile(args[1]));

    // Execute the pipeline as a MapReduce.
    return pipeline.done().succeeded() ? 0 : 1;
  }

  }

  @Test
  public void testHFileTarget() throws IOException {
    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, conf);
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String,Long> wordCounts = words.count();
    PCollection<KeyValue> wordCountKeyValues = convertToKeyValues(wordCounts);
    pipeline.write(wordCountKeyValues, ToHBase.hfile(outputPath));

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    FileSystem fs = FileSystem.get(conf);
    KeyValue kv = readFromHFiles(fs, outputPath, "and");
    assertEquals(427L, Bytes.toLong(kv.getValue()));

  }

  @Test
  public void testBulkLoad() throws Exception {
    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, conf);
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");
    HTable testTable = createTable(26);

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String,Long> wordCounts = words.count();
    PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        wordCountPuts,
        testTable,
        outputPath);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration())
        .doBulkLoad(outputPath, testTable);

  /** See CRUNCH-251 */
  @Test
  public void testMultipleHFileTargets() throws Exception {
    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, conf);
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath1 = getTempPathOnHDFS("out1");
    Path outputPath2 = getTempPathOnHDFS("out2");
    HTable table1 = createTable(10);
    HTable table2 = createTable(20);
    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration());

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PCollection<String> shortWords = words.filter(SHORT_WORD_FILTER);
    PCollection<String> longWords = words.filter(FilterFns.not(SHORT_WORD_FILTER));
    PTable<String, Long> shortWordCounts = shortWords.count();
    PTable<String, Long> longWordCounts = longWords.count();
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(shortWordCounts),
        table1,
        outputPath1);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(longWordCounts),
        table2,
        outputPath2);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());
    loader.doBulkLoad(outputPath1, table1);
    loader.doBulkLoad(outputPath2, table2);

    assertEquals(396L, getWordCountFromTable(table1, "of"));
