Package org.apache.crunch

Examples of org.apache.crunch.Pipeline
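The snippets below all follow the same Pipeline lifecycle: construct an MRPipeline, read input into a PCollection, transform it with DoFns, write the result to a target, and call done() to execute the underlying MapReduce jobs. The following word count is a minimal, self-contained sketch of that pattern; the class name and path arguments are illustrative rather than taken from any of the examples below.

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;

public class WordCountSketch {
  public static int run(String inputPath, String outputPath) {
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(WordCountSketch.class, new Configuration());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(inputPath);
    // Split each line into whitespace-separated words.
    PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {
      @Override
      public void process(String line, Emitter<String> emitter) {
        for (String word : line.split("\\s+")) {
          if (!word.isEmpty()) {
            emitter.emit(word);
          }
        }
      }
    }, Writables.strings());
    // Count the occurrences of each word and write the table out as text.
    PTable<String, Long> counts = words.count();
    pipeline.writeTextFile(counts, outputPath);
    // Execute the pipeline as MapReduce and report success or failure.
    return pipeline.done().succeeded() ? 0 : 1;
  }
}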


      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(SecondarySortExample.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Define a function that parses each line in a PCollection of Strings into
    // a pair of pairs: the outer pair's first member is the key that records are
    // grouped by, and the inner pair's first member (the timestamp) is what each
    // group is sorted by. The inner pair is the payload that is passed to the
    // reducer in an Iterable.
    PTable<String, Pair<Long, String>> pairs = lines.parallelDo("extract_records",
        new DoFn<String, Pair<String, Pair<Long, String>>>() {
          @Override
          public void process(String line, Emitter<Pair<String, Pair<Long, String>>> emitter) {
            int i = 0;
            String key = "";
            long timestamp = 0;
            String value = "";
            for (String element : INPUT_SPLITTER.split(line)) {
              switch (++i) {
              case 1:
                key = element;
                break;
              case 2:
                try {
                  timestamp = Long.parseLong(element);
                } catch (NumberFormatException e) {
                  System.out.println("Timestamp not in long format '" + line + "'");
                  this.increment(COUNTERS.CORRUPT_TIMESTAMP);
                }
                break;
              case 3:
                value = element;
                break;
              default:
                System.err.println("i = " + i + " should never happen!");
                break;
              }
            }
            if (i == 3) {
              Long sortby = Long.valueOf(timestamp);
              emitter.emit(Pair.of(key, Pair.of(sortby, value)));
            } else {
              this.increment(COUNTERS.CORRUPT_LINE);
            }
          }}, Avros.tableOf(Avros.strings(), Avros.pairs(Avros.longs(), Avros.strings())));

    // With one reducer, the grouped and sorted output for the input above will be:

    // one : [[-10,garbage],[-5,10],[1,1],[2,-3]]
    // three : [[0,-1]]
    // two : [[1,7,9],[2,6],[4,5]]

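    // Group the records by key, sort each group's values by the Long timestamp,
    // and format one output line per key.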
    SecondarySort.sortAndApply(pairs,
        new DoFn<Pair<String, Iterable<Pair<Long, String>>>, String>() {
          final StringBuilder sb = new StringBuilder();
          @Override
          public void process(Pair<String, Iterable<Pair<Long, String>>> input, Emitter<String> emitter) {
            sb.setLength(0);
            sb.append(input.first());
            sb.append(" : [");
            boolean first = true;
            for(Pair<Long, String> pair : input.second()) {
              if (first) {
                first = false;
              } else {
                sb.append(',');
              }
              sb.append(pair);
            }
            sb.append("]");
            emitter.emit(sb.toString());
          }
        }, Writables.strings()).write(To.textFile(args[1]));

    // Execute the pipeline as a MapReduce.
    return pipeline.done().succeeded() ? 0 : 1;
  }
View Full Code Here


  }

  @Test
  public void testHFileTarget() throws IOException {
    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, conf);
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");

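    // Split the text into words, count them, convert each (word, count) pair into
    // an HBase KeyValue, and write the KeyValues out as HFiles under outputPath.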
    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String, Long> wordCounts = words.count();
    PCollection<KeyValue> wordCountKeyValues = convertToKeyValues(wordCounts);
    pipeline.write(wordCountKeyValues, ToHBase.hfile(outputPath));

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    FileSystem fs = FileSystem.get(conf);
    KeyValue kv = readFromHFiles(fs, outputPath, "and");
    assertEquals(427L, Bytes.toLong(kv.getValue()));
View Full Code Here

  }

  @Test
  public void testBulkLoad() throws Exception {
    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, conf);
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");
    HTable testTable = createTable(26);

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String, Long> wordCounts = words.count();
    PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
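    // Write the word-count Puts as HFiles laid out for an incremental bulk load into testTable.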
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        wordCountPuts,
        testTable,
        outputPath);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration())
        .doBulkLoad(outputPath, testTable);
View Full Code Here

  /** See CRUNCH-251 */
  @Test
  public void testMultipleHFileTargets() throws Exception {
    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, conf);
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath1 = getTempPathOnHDFS("out1");
    Path outputPath2 = getTempPathOnHDFS("out2");
    HTable table1 = createTable(10);
    HTable table2 = createTable(20);
    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration());

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PCollection<String> shortWords = words.filter(SHORT_WORD_FILTER);
    PCollection<String> longWords = words.filter(FilterFns.not(SHORT_WORD_FILTER));
    PTable<String, Long> shortWordCounts = shortWords.count();
    PTable<String, Long> longWordCounts = longWords.count();
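    // Write each set of counts to its own HFile output directory in the same pipeline run.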
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(shortWordCounts),
        table1,
        outputPath1);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(longWordCounts),
        table2,
        outputPath2);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());
    loader.doBulkLoad(outputPath1, table1);
    loader.doBulkLoad(outputPath2, table2);

    assertEquals(396L, getWordCountFromTable(table1, "of"));
View Full Code Here

  @Test
  public void testHFileUsesFamilyConfig() throws IOException {
    DataBlockEncoding newBlockEncoding = DataBlockEncoding.PREFIX;
    assertNotSame(newBlockEncoding, DataBlockEncoding.valueOf(HColumnDescriptor.DEFAULT_DATA_BLOCK_ENCODING));

    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, conf);
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");
    HColumnDescriptor hcol = new HColumnDescriptor(TEST_FAMILY);
    hcol.setDataBlockEncoding(newBlockEncoding);
    HTable testTable = createTable(10, hcol);

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String, Long> wordCounts = words.count();
    PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        wordCountPuts,
        testTable,
        outputPath);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

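    // Inspect the HFiles written under the TEST_FAMILY column family directory.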
    int hfilesCount = 0;
    FileSystem fs = outputPath.getFileSystem(conf);
    for (FileStatus e : fs.listStatus(new Path(outputPath, Bytes.toString(TEST_FAMILY)))) {
View Full Code Here

    List<KeyValue> kvs = generateKeyValues(100);
    Path inputPath = tmpDir.getPath("in");
    Path outputPath = tmpDir.getPath("out");
    writeKeyValuesToHFile(inputPath, kvs);

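    // Read the KeyValues back from the HFile, render each one as a String,
    // and write the results to a text file.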
    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<KeyValue> in = pipeline.read(FromHBase.hfile(inputPath));
    PCollection<String> texts = in.parallelDo(new MapFn<KeyValue, String>() {
      @Override
      public String map(KeyValue input) {
        return input.toString();
      }
    }, strings());
    texts.write(To.textFile(outputPath));
    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    List<String> lines = FileUtils.readLines(new File(outputPath.toString(), "part-m-00000"));
    assertEquals(kvs.size(), lines.size());
    for (int i = 0; i < kvs.size(); i++) {
View Full Code Here

  private List<Result> doTestScanHFiles(List<KeyValue> kvs, Scan scan) throws IOException {
    Path inputPath = tmpDir.getPath("in");
    writeKeyValuesToHFile(inputPath, kvs);

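    // Scan the HFiles with the supplied Scan and materialize the Results on the client.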
    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<Result> results = HFileUtils.scanHFiles(pipeline, inputPath, scan);
    return ImmutableList.copyOf(results.materialize());
  }
View Full Code Here

  private List<KeyValue> doTestReadHFiles(List<KeyValue> kvs, Scan scan) throws IOException {
    Path inputPath = tmpDir.getPath("in");
    writeKeyValuesToHFile(inputPath, kvs);

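    // Read the raw KeyValues from the HFile source and materialize them on the client.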
    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<KeyValue> results = pipeline.read(FromHBase.hfile(inputPath));
    return ImmutableList.copyOf(results.materialize());
  }
View Full Code Here

    server.stop();
  }

  @Test
  public void testReadFromSource() throws Exception {
    Pipeline pipeline = new MRPipeline(DataBaseSourceIT.class);
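    // Configure a JDBC source that maps each row of the TEST table to an IdentifiableName record.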
    DataBaseSource<IdentifiableName> dbsrc = new DataBaseSource.Builder<IdentifiableName>(IdentifiableName.class)
        .setDriverClass(org.h2.Driver.class)
        .setUrl("jdbc:h2:tcp://localhost/~/test").setUsername("sa").setPassword("")
        .selectSQLQuery("SELECT ID, NAME FROM TEST").countSQLQuery("select count(*) from Test").build();

    PCollection<IdentifiableName> cdidata = pipeline.read(dbsrc);
    PCollection<String> names = cdidata.parallelDo(new DoFn<IdentifiableName, String>() {

      @Override
      public void process(IdentifiableName input, Emitter<String> emitter) {
        emitter.emit(input.name.toString());
      }

    }, Writables.strings());

    List<String> nameList = Lists.newArrayList(names.materialize());
    pipeline.done();

    assertEquals(2, nameList.size());
    assertEquals(Sets.newHashSet("Hello", "World"), Sets.newHashSet(nameList));

  }
View Full Code Here

      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Combiner used for summing up response size and count
    CombineFn<String, Pair<Long, Long>> stringPairOfLongsSumCombiner = CombineFn.pairAggregator(CombineFn.SUM_LONGS,
        CombineFn.SUM_LONGS);

    // Table of (ip, sum(response size), count)
    PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines
        .parallelDo(extractResponseSize,
            Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs()))).groupByKey()
        .combineValues(stringPairOfLongsSumCombiner);

    // Calculate average response size by ip address
    PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage,
        Writables.tableOf(Writables.strings(), Writables.doubles()));

    // write the result to a text file
    pipeline.writeTextFile(avgs, args[1]);
    // Execute the pipeline as a MapReduce.
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
  }
View Full Code Here
