Package org.apache.crunch.impl.mr

Examples of org.apache.crunch.impl.mr.MRPipeline
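
MRPipeline is the MapReduce-based implementation of Crunch's Pipeline interface: it plans the graph of DoFns into one or more MapReduce jobs and runs them on a Hadoop cluster. As a minimal, self-contained sketch of the typical lifecycle (MyTool and the path arguments are placeholders, not part of the excerpts below):

import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.hadoop.conf.Configuration;

public class MyTool {
  public static void main(String[] args) {
    // The class argument tells Crunch which jar to ship to the cluster.
    Pipeline pipeline = new MRPipeline(MyTool.class, new Configuration());
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    pipeline.writeTextFile(lines, args[1]);
    // done() runs all pending jobs and cleans up temporary resources.
    PipelineResult result = pipeline.done();
    System.exit(result.succeeded() ? 0 : 1);
  }
}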


  public void testHFileUsesFamilyConfig() throws IOException {
    DataBlockEncoding newBlockEncoding = DataBlockEncoding.PREFIX;
    assertNotSame(newBlockEncoding, DataBlockEncoding.valueOf(HColumnDescriptor.DEFAULT_DATA_BLOCK_ENCODING));

    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, conf);
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");
    HColumnDescriptor hcol = new HColumnDescriptor(TEST_FAMILY);
    hcol.setDataBlockEncoding(newBlockEncoding);
    HTable testTable = createTable(10, hcol);

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String,Long> wordCounts = words.count();
    PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        wordCountPuts,
        testTable,
        outputPath);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    int hfilesCount = 0;
    FileSystem fs = outputPath.getFileSystem(conf);
    for (FileStatus e : fs.listStatus(new Path(outputPath, Bytes.toString(TEST_FAMILY)))) {
      // ... (remainder of the verification loop elided in this excerpt)
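
writePutsToHFilesForIncrementalLoad only produces HFiles under outputPath; a separate bulk load is still needed to make the data visible in the table. A hedged sketch of that follow-up step, using HBase's LoadIncrementalHFiles (the checked exceptions it declares vary by HBase version):

    // Move the generated HFiles under outputPath into the regions of testTable.
    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
    loader.doBulkLoad(outputPath, testTable);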


   * files in that path. The method returns a {@link PObject} containing a
   * {@link Map} with file names as keys and filters as values.
   */
  public static PObject<Map<String, BloomFilter>> createFilter(Path inputPath, BloomFilterFn<String> filterFn)
      throws IOException {
    MRPipeline pipeline = new MRPipeline(BloomFilterFactory.class);
    FileStatus[] listStatus = FileSystem.get(pipeline.getConfiguration()).listStatus(inputPath);
    PTable<String, BloomFilter> filterTable = null;
    for (FileStatus fileStatus : listStatus) {
      Path path = fileStatus.getPath();
      PCollection<String> readTextFile = pipeline.readTextFile(path.toString());
      pipeline.getConfiguration().set(BloomFilterFn.CRUNCH_FILTER_NAME, path.getName());
      PTable<String, BloomFilter> currentTable = createFilterTable(readTextFile, filterFn);
      if (filterTable != null) {
        filterTable = filterTable.union(currentTable);
      } else {
        filterTable = currentTable;
        // ... (remainder of method elided in this excerpt)
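
A sketch of how the returned PObject might be consumed; PObject.getValue() forces the pipeline to execute and materializes the result on the client (MyFilterFn and the input path are hypothetical placeholders):

    PObject<Map<String, BloomFilter>> filters =
        BloomFilterFactory.createFilter(new Path("/data/in"), new MyFilterFn());
    // getValue() blocks until the pipeline finishes and returns the map.
    Map<String, BloomFilter> filterByFileName = filters.getValue();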


  @Test
  public void testReadFromSource() throws Exception {
    Pipeline pipeline = new MRPipeline(DataBaseSourceIT.class);
    DataBaseSource<IdentifiableName> dbsrc = new DataBaseSource.Builder<IdentifiableName>(IdentifiableName.class)
        .setDriverClass(org.h2.Driver.class)
        .setUrl("jdbc:h2:tcp://localhost/~/test").setUsername("sa").setPassword("")
        .selectSQLQuery("SELECT ID, NAME FROM TEST").countSQLQuery("select count(*) from Test").build();

    PCollection<IdentifiableName> cdidata = pipeline.read(dbsrc);
    PCollection<String> names = cdidata.parallelDo(new DoFn<IdentifiableName, String>() {

      @Override
      public void process(IdentifiableName input, Emitter<String> emitter) {
        emitter.emit(input.name.toString());
      }

    }, Writables.strings());

    List<String> nameList = Lists.newArrayList(names.materialize());
    pipeline.done();

    assertEquals(2, nameList.size());
    assertEquals(Sets.newHashSet("Hello", "World"), Sets.newHashSet(nameList));

  }
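
The test expects an H2 server exposing a TEST table with the two rows 'Hello' and 'World'. A sketch of that fixture using plain JDBC (the exact schema is an assumption inferred from the SELECT above):

    // Plain JDBC fixture setup (java.sql.Connection/DriverManager/Statement).
    Class.forName("org.h2.Driver");
    Connection conn = DriverManager.getConnection("jdbc:h2:tcp://localhost/~/test", "sa", "");
    Statement stmt = conn.createStatement();
    stmt.execute("CREATE TABLE TEST (ID INT PRIMARY KEY, NAME VARCHAR(32))");
    stmt.execute("INSERT INTO TEST VALUES (1, 'Hello'), (2, 'World')");
    conn.close();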

    if (args.length != 2) { // usage check; condition reconstructed from context
      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Combiner used for summing up response size and count
    CombineFn<String, Pair<Long, Long>> stringPairOfLongsSumCombiner = CombineFn.pairAggregator(CombineFn.SUM_LONGS,
        CombineFn.SUM_LONGS);

    // Table of (ip, sum(response size), count)
    PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines
        .parallelDo(extractResponseSize,
            Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs()))).groupByKey()
        .combineValues(stringPairOfLongsSumCombiner);

    // Calculate average response size by IP address
    PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calculateAverage,
        Writables.tableOf(Writables.strings(), Writables.doubles()));

    // write the result to a text file
    pipeline.writeTextFile(avgs, args[1]);
    // Execute the pipeline as a MapReduce.
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
  }
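
The extractResponseSize DoFn is defined elsewhere in the example. A plausible sketch (the log format and field positions are assumptions): it emits one (ip, (bytes, 1)) pair per line so the combiner can sum response sizes and counts in a single pass:

    // Hypothetical sketch of the DoFn referenced above; field indices are assumed.
    DoFn<String, Pair<String, Pair<Long, Long>>> extractResponseSize =
        new DoFn<String, Pair<String, Pair<Long, Long>>>() {
          @Override
          public void process(String line, Emitter<Pair<String, Pair<Long, Long>>> emitter) {
            String[] fields = line.split(" ");
            try {
              long size = Long.parseLong(fields[fields.length - 1]);
              emitter.emit(Pair.of(fields[0], Pair.of(size, 1L)));
            } catch (NumberFormatException e) {
              // Skip malformed lines rather than failing the job.
            }
          }
        };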

    if (args.length != 2) { // usage check; condition reconstructed from context
      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Combiner used for summing up response size
    CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();

    // Table of (ip, sum(response size))
    PTable<String, Long> ipAddrResponseSize = lines
        .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs())).groupByKey()
        .combineValues(longSumCombiner);

    pipeline.writeTextFile(ipAddrResponseSize, args[1]);
    // Execute the pipeline as a MapReduce.
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
  }
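
On Crunch releases that ship org.apache.crunch.fn.Aggregators, CombineFn.SUM_LONGS() is deprecated, and the same combine step can be written as:

    // Equivalent combine step using the Aggregators factory (newer Crunch API).
    PTable<String, Long> ipAddrResponseSize = lines
        .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs()))
        .groupByKey()
        .combineValues(Aggregators.SUM_LONGS());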

    createTable(configuration, TABLE_TARGET, Bytes.toString(COLUMN_FAMILY_TARGET));

    putInHbase(putList, configuration);

    // We create the pipeline that will handle most of the job.
    Pipeline pipeline = new MRPipeline(WordAggregationHBase.class, HBaseConfiguration.create());

    // The scan that will retrieve the data from the source table in HBase.
    Scan scan = new Scan();
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY);
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE);

    // Our HBase source
    HBaseSourceTarget source = new HBaseSourceTarget(TABLE_SOURCE, scan);

    // Our source, in a format that can be used by Crunch
    PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);

    // We process the data from the source HTable, then concatenate all data
    // with the same row key
    PTable<String, String> textExtracted = extractText(rawText);
    CombineFn<String, String> stringConcatCombine = CombineFn.STRING_CONCAT(" ", true);
    PTable<String, String> result = textExtracted.groupByKey().combineValues(stringConcatCombine);

    // We create the collection of puts from the concatenated data
    PCollection<Put> resultPut = createPut(result);

    // We write the puts to the target table in HBase
    pipeline.write(resultPut, new HBaseTarget(TABLE_TARGET));

    pipeline.done();
    return 0;
  }
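
The createPut helper is defined elsewhere in the example. A hedged sketch of what it plausibly does (the target qualifier constant and the Writable-based Put PType are assumptions tied to the HBase 0.94-era API this example uses):

    // Hypothetical sketch of the createPut helper referenced above.
    public PCollection<Put> createPut(PTable<String, String> aggregated) {
      return aggregated.parallelDo(new DoFn<Pair<String, String>, Put>() {
        @Override
        public void process(Pair<String, String> input, Emitter<Put> emitter) {
          Put put = new Put(Bytes.toBytes(input.first()));
          // COLUMN_QUALIFIER_TARGET is a placeholder for the real qualifier constant.
          put.add(COLUMN_FAMILY_TARGET, COLUMN_QUALIFIER_TARGET, Bytes.toBytes(input.second()));
          emitter.emit(put);
        }
      }, Writables.writables(Put.class)); // assumes an HBase version where Put implements Writable
    }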

  @Rule
  public transient TemporaryPath tmpDir = TemporaryPaths.create();

  @Test
  public void testWritableSortAsc() throws Exception {
    runSingle(new MRPipeline(SortIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), Order.ASCENDING,
        "A\tand this text as well");
  }

  @Test
  public void testWritableSortDesc() throws Exception {
    runSingle(new MRPipeline(SortIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), Order.DESCENDING,
        "B\tthis doc has some text");
  }

  @Test
  public void testWritableSortAscDesc() throws Exception {
    runPair(new MRPipeline(SortIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING), "A",
        "this doc has this text");
  }

  @Test
  public void testWritableSortSecondDescFirstAsc() throws Exception {
    runPair(new MRPipeline(SortIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), by(2, DESCENDING), by(1, ASCENDING), "A",
        "this doc has this text");
  }
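
The runSingle and runPair helpers are defined in SortIT; the tests ultimately exercise org.apache.crunch.lib.Sort. A minimal sketch of the underlying calls (docs and pairs stand in for the test collections):

    // Total-order sort of a PCollection (what runSingle exercises).
    PCollection<String> sortedDocs = Sort.sort(docs, Order.ASCENDING);

    // Column-wise ordering of a PCollection of pairs (what runPair exercises);
    // by(1, ASCENDING) orders on the first column, by(2, DESCENDING) on the second.
    PCollection<Pair<String, String>> sortedPairs =
        Sort.sortPairs(pairs, by(1, ASCENDING), by(2, DESCENDING));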
