Package org.apache.crunch

Examples of org.apache.crunch.Pipeline
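A Pipeline manages the flow of a Crunch job: it reads one or more sources into PCollections, transforms them with DoFns, writes the results to targets, and executes everything when run() or done() is called. As a minimal sketch of that lifecycle (the class name and paths here are hypothetical, not taken from the examples below):

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.At;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;

public class WordCountSketch {
  public static void main(String[] args) {
    Pipeline pipeline = new MRPipeline(WordCountSketch.class, new Configuration());

    // Read lines of text, tokenize them, and count each word.
    PCollection<String> lines = pipeline.read(At.textFile("/tmp/in", Writables.strings()));
    PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {
      @Override
      public void process(String line, Emitter<String> emitter) {
        for (String word : line.split("\\s+")) {
          emitter.emit(word);
        }
      }
    }, Writables.strings());
    PTable<String, Long> counts = words.count();

    // Write the counts as text and block until the job completes.
    pipeline.writeTextFile(counts, "/tmp/out");
    pipeline.done();
  }
}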


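From HFileTargetIT: a single pipeline writes two independent sets of HFiles (counts of short and long words) and bulk-loads each into its own table.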

  /** See CRUNCH-251 */
  @Test
  public void testMultipleHFileTargets() throws Exception {
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath1 = getTempPathOnHDFS("out1");
    Path outputPath2 = getTempPathOnHDFS("out2");
    HTable table1 = createTable(26);
    HTable table2 = createTable(26);
    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration());

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PCollection<String> shortWords = words.filter(SHORT_WORD_FILTER);
    PCollection<String> longWords = words.filter(FilterFns.not(SHORT_WORD_FILTER));
    PTable<String, Long> shortWordCounts = shortWords.count();
    PTable<String, Long> longWordCounts = longWords.count();
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(shortWordCounts),
        table1,
        outputPath1);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(longWordCounts),
        table2,
        outputPath2);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    loader.doBulkLoad(outputPath1, table1);
    loader.doBulkLoad(outputPath2, table2);

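Also from HFileTargetIT: verifies that HFiles written for incremental load honor per-family configuration, here a non-default data block encoding.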


  @Test
  public void testHFileUsesFamilyConfig() throws Exception {
    DataBlockEncoding newBlockEncoding = DataBlockEncoding.PREFIX;
    assertNotSame(newBlockEncoding, DataBlockEncoding.valueOf(HColumnDescriptor.DEFAULT_DATA_BLOCK_ENCODING));

    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");
    HColumnDescriptor hcol = new HColumnDescriptor(TEST_FAMILY);
    hcol.setDataBlockEncoding(newBlockEncoding);
    HTable testTable = createTable(26, hcol);

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String, Long> wordCounts = words.count();
    PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        wordCountPuts,
        testTable,
        outputPath);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    int hfilesCount = 0;
    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    FileSystem fs = outputPath.getFileSystem(conf);
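From the WordAggregationHBase example: read plays and quotes from a source HTable, concatenate all quotes that share a rowkey, and write the result back to a target table.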

    createTable(configuration, TABLE_TARGET, Bytes.toString(COLUMN_FAMILY_TARGET));

    putInHbase(putList, configuration);

    // We create the pipeline, which will handle most of the job.
    Pipeline pipeline = new MRPipeline(WordAggregationHBase.class, HBaseConfiguration.create());

    // The scan that will retrieve the data from the source table in HBase.
    Scan scan = new Scan();
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY);
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE);

    // Our HBase source
    HBaseSourceTarget source = new HBaseSourceTarget(TABLE_SOURCE, scan);

    // Our source, in a format that can be used by Crunch
    PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);

    // We process the data from the source HTable, then concatenate all data
    // sharing the same rowkey
    PTable<String, String> textExtracted = extractText(rawText);
    PTable<String, String> result = textExtracted.groupByKey()
        .combineValues(Aggregators.STRING_CONCAT(" ", true));

    // We create the collection of Puts from the concatenated data
    PCollection<Put> resultPut = createPut(result);

    // We write the Puts into the target table in HBase
    pipeline.write(resultPut, new HBaseTarget(TABLE_TARGET));

    pipeline.done();
    return 0;
  }
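Note that the second argument to Aggregators.STRING_CONCAT controls null handling: passing true skips null values instead of failing on them.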
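From OrcFileSourceTargetIT: a generic round-trip helper that reads a value of any PType from an ORC file, writes it back through an OrcFileTarget, and re-reads the output to verify it.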

 
  private <T> void testSourceTarget(PType<T> ptype, T expected) {
    Path inputPath = new Path(tempPath, "input.orc");
    Path outputPath = new Path(tempPath, "output");
   
    Pipeline pipeline = new MRPipeline(OrcFileSourceTargetIT.class, conf);
    OrcFileSource<T> source = new OrcFileSource<T>(inputPath, ptype);
    PCollection<T> rows = pipeline.read(source);
    List<T> result = Lists.newArrayList(rows.materialize());
   
    assertEquals(Lists.newArrayList(expected), result);
   
    OrcFileTarget target = new OrcFileTarget(outputPath);
    pipeline.write(rows, target);
   
    assertTrue(pipeline.done().succeeded());
   
    OrcFileReaderFactory<T> reader = new OrcFileReaderFactory<T>(ptype);
    // Read back the file the target wrote (assuming the default part-file name)
    // so the assertion checks the output rather than re-reading the input.
    List<T> newResult = Lists.newArrayList(reader.read(fs, new Path(outputPath, "part-m-00000.orc")));
   
    assertEquals(Lists.newArrayList(expected), newResult);
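Also from OrcFileSourceTargetIT: column pruning. Only columns 0 and 1 are read, so the unread field of Person comes back as null.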

 
  @Test
  public void testColumnPruning() throws IOException {
    generateInputData();
   
    Pipeline pipeline = new MRPipeline(OrcFileSourceTargetIT.class, conf);
    int[] readColumns = {0, 1};
    OrcFileSource<Person> source = new OrcFileSource<Person>(new Path(tempPath, "input.orc"),
        Orcs.reflects(Person.class), readColumns);
    PCollection<Person> rows = pipeline.read(source);
    List<Person> result = Lists.newArrayList(rows.materialize());
   
    Person expected = new Person("Alice", 23, null);
    assertEquals(Lists.newArrayList(expected), result);
  }
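Another OrcFileSourceTargetIT snippet, picking up after several Person records have been written to the input file: counting how many times each distinct record occurs.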

    writer.write(s2);
    writer.write(s3);
    writer.write(s4);
    writer.close();
   
    Pipeline pipeline = new MRPipeline(OrcFileSourceTargetIT.class, conf);
    OrcFileSource<Person> source = new OrcFileSource<Person>(inputPath, Orcs.reflects(Person.class));
    PCollection<Person> rows = pipeline.read(source);
    PTable<Person, Long> count = rows.count();

    List<Pair<Person, Long>> result = Lists.newArrayList(count.materialize());
    List<Pair<Person, Long>> expected = Lists.newArrayList(
        Pair.of(new Person("Alice", 23, Arrays.asList("444-333-9999")), 1L),
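From HFileSourceIT: read KeyValues from an HFile, render each as a string, write them to a text file, and check the line count against the input.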

    List<KeyValue> kvs = generateKeyValues(100);
    Path inputPath = tmpDir.getPath("in");
    Path outputPath = tmpDir.getPath("out");
    writeKeyValuesToHFile(inputPath, kvs);

    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<KeyValue> in = pipeline.read(FromHBase.hfile(inputPath));
    PCollection<String> texts = in.parallelDo(new MapFn<KeyValue, String>() {
      @Override
      public String map(KeyValue input) {
        return input.toString();
      }
    }, strings());
    texts.write(To.textFile(outputPath));
    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    List<String> lines = FileUtils.readLines(new File(outputPath.toString(), "part-m-00000"));
    assertEquals(kvs.size(), lines.size());
    for (int i = 0; i < kvs.size(); i++) {
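An HFileSourceIT helper that scans HFiles into HBase Results via HFileUtils.scanHFiles: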

  private List<Result> doTestScanHFiles(List<KeyValue> kvs, Scan scan) throws IOException {
    Path inputPath = tmpDir.getPath("in");
    writeKeyValuesToHFile(inputPath, kvs);

    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<Result> results = HFileUtils.scanHFiles(pipeline, inputPath, scan);
    return ImmutableList.copyOf(results.materialize());
  }
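And its counterpart, which reads the raw KeyValues directly: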

  private List<KeyValue> doTestReadHFiles(List<KeyValue> kvs, Scan scan) throws IOException {
    Path inputPath = tmpDir.getPath("in");
    writeKeyValuesToHFile(inputPath, kvs);

    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<KeyValue> results = pipeline.read(FromHBase.hfile(inputPath));
    return ImmutableList.copyOf(results.materialize());
  }
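From CountersTest: incrementing custom counters inside a MapFn running on the in-memory MemPipeline.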

public class CountersTest {

  @Test
  public void counterTest() throws Exception {
    Pipeline pipeline = MemPipeline.getInstance();

    // Single row PCollection.
    PCollection<String> objects = MemPipeline.collectionOf(Arrays.asList(new String[]{"hello world"}));
    System.out.println("Objects: " + ((MemCollection) objects).getCollection());

    // A MapFn that increments 200 counters per input element.
    PCollection<String> objects2 = objects.parallelDo("Create counters",
        new MapFn<String, String>() {
          @Override
          public String map(String input) {
            for (int i = 0; i < 200; ++i) {
              this.increment("testCounter", String.valueOf(i));
            }
            return input;
          }
        },
        Writables.strings()
    );

    // MemPipeline executes eagerly, so the MapFn above has already run;
    // done() just finalizes the pipeline.
    pipeline.done();
    System.out.println("Objects2: " + ((MemCollection) objects2).getCollection());
  }
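The counter values are available after the run; a hedged sketch of reading one back, assuming the static MemPipeline.getCounters() accessor:

    // Each of the 200 counters in the "testCounter" group should now be 1.
    long value = MemPipeline.getCounters().findCounter("testCounter", "0").getValue();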


