Examples of MRPipeline
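MRPipeline is the MapReduce-backed implementation of Crunch's Pipeline interface. Every example below follows the same lifecycle: construct an MRPipeline with a class that anchors the job jar (plus, usually, a Hadoop Configuration), read one or more sources into PCollections, chain transformations such as parallelDo, groupByKey, and count, declare outputs with write or writeTextFile, and finally call run() or done() to plan and execute the underlying MapReduce jobs. Below is a minimal word-count sketch of that pattern, assuming a recent Apache Crunch release; the class name and input/output paths are illustrative, not taken from any one example.

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;

public class WordCountSketch {
  public static void main(String[] args) throws Exception {
    // The class argument tells Hadoop which jar to ship to the cluster.
    Pipeline pipeline = new MRPipeline(WordCountSketch.class, new Configuration());

    PCollection<String> lines = pipeline.readTextFile("input"); // illustrative path

    // Tokenize each line, then count occurrences of each word.
    PTable<String, Long> counts = lines
        .parallelDo(new DoFn<String, String>() {
          @Override
          public void process(String line, Emitter<String> emitter) {
            for (String word : line.split("\\s+")) {
              emitter.emit(word);
            }
          }
        }, Writables.strings())
        .count();

    pipeline.writeTextFile(counts, "output"); // illustrative path
    pipeline.done(); // runs the planned job(s), then cleans up temporary data
  }
}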


Examples of com.cloudera.crunch.impl.mr.MRPipeline
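This test, written against the original com.cloudera.crunch packages (Crunch's package names before the project moved to Apache), reads tab-separated integer pairs from a text file, parses them in a DoFn, sorts them by the first field ascending and the second descending with Sort.sortPairs, and writes the sorted pairs back out as text. The pairs(ints(), ints()) type and the by/ASCENDING/DESCENDING orderings come from statically imported Writables and Sort helpers.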

public class SortCrunchTest implements Serializable {
 
  @Test
  public void test() throws IOException {
    Pipeline pipeline = new MRPipeline(SortCrunchTest.class);
    PCollection<String> records = pipeline.readTextFile("sort/A");
   
    PCollection<Pair<Integer, Integer>> pairs = records.parallelDo(new DoFn<String, Pair<Integer, Integer>>() {
      @Override
      public void process(String input, Emitter<Pair<Integer, Integer>> emitter) {
        Iterator<String> split = Splitter.on('\t').split(input).iterator();
        String l = split.next();
        String r = split.next();
        emitter.emit(Pair.of(Integer.parseInt(l), Integer.parseInt(r)));
      }
    }, pairs(ints(), ints()));
   
    PCollection<Pair<Integer, Integer>> sorted = Sort.sortPairs(pairs, by(1, ASCENDING), by(2, DESCENDING));
   
    pipeline.writeTextFile(sorted, "output-sorted");
    pipeline.run();
  }
}

Examples of org.apache.crunch.impl.mr.MRPipeline
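AvroPathPerKeyIT exercises AvroPathPerKeyTarget, a target that writes each key's values to its own output path: the test parses a tab-delimited resource file into a string-to-string table, groups it by key, writes it through the target, and then lists the output directory to collect the per-key file names.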

import static org.junit.Assert.assertEquals;

public class AvroPathPerKeyIT extends CrunchTestSupport implements Serializable {
  @Test
  public void testOutputFilePerKey() throws Exception {
    Pipeline p = new MRPipeline(AvroPathPerKeyIT.class, tempDir.getDefaultConfiguration());
    Path outDir = tempDir.getPath("out");
    p.read(From.textFile(tempDir.copyResourceFileName("docs.txt")))
        .parallelDo(new MapFn<String, Pair<String, String>>() {
          @Override
          public Pair<String, String> map(String input) {
            String[] fields = input.split("\t");
            return Pair.of(fields[0], fields[1]);
          }
        }, Avros.tableOf(Avros.strings(), Avros.strings()))
        .groupByKey()
        .write(new AvroPathPerKeyTarget(outDir));
    p.done();

    Set<String> names = Sets.newHashSet();
    FileSystem fs = outDir.getFileSystem(tempDir.getDefaultConfiguration());
    for (FileStatus fstat : fs.listStatus(outDir)) {
      names.add(fstat.getPath().getName());
    }
    // ... assertions on the collected per-key file names elided
  }
}

Examples of org.apache.crunch.impl.mr.MRPipeline
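From HFileTargetIT: a word count over a Shakespeare text is converted to HBase key-values and written as HFiles with ToHBase.hfile; the test then reads a known word's count back out of the generated files.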

  @Test
  public void testHFileTarget() throws Exception {
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String, Long> wordCounts = words.count();
    pipeline.write(convertToKeyValues(wordCounts), ToHBase.hfile(outputPath));

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration());
    KeyValue kv = readFromHFiles(fs, outputPath, "and");
    assertEquals(427L, Bytes.toLong(kv.getValue()));
  }

Examples of org.apache.crunch.impl.mr.MRPipeline
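Also from HFileTargetIT: the same word count is converted to Puts, written out with HFileUtils.writePutsToHFilesForIncrementalLoad, and then bulk-loaded into a live test table using LoadIncrementalHFiles.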

  @Test
  public void testBulkLoad() throws Exception {
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");
    HTable testTable = createTable(26);

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String,Long> wordCounts = words.count();
    PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        wordCountPuts,
        testTable,
        outputPath);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration())
        .doBulkLoad(outputPath, testTable);
    // ... verification of the loaded table contents elided
  }

Examples of org.apache.crunch.impl.mr.MRPipeline
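A regression test for CRUNCH-251: a single pipeline writes two independent HFile targets (short and long words, split with complementary filters) and bulk-loads each into its own table.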

  /** See CRUNCH-251 */
  @Test
  public void testMultipleHFileTargets() throws Exception {
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath1 = getTempPathOnHDFS("out1");
    Path outputPath2 = getTempPathOnHDFS("out2");
    HTable table1 = createTable(26);
    HTable table2 = createTable(26);
    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration());

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PCollection<String> shortWords = words.filter(SHORT_WORD_FILTER);
    PCollection<String> longWords = words.filter(FilterFns.not(SHORT_WORD_FILTER));
    PTable<String, Long> shortWordCounts = shortWords.count();
    PTable<String, Long> longWordCounts = longWords.count();
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(shortWordCounts),
        table1,
        outputPath1);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(longWordCounts),
        table2,
        outputPath2);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    loader.doBulkLoad(outputPath1, table1);
    loader.doBulkLoad(outputPath2, table2);

    // ... verification of both loaded tables elided
  }

Examples of org.apache.crunch.impl.mr.MRPipeline
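This test checks that the HFile target honors per-column-family configuration: it sets a non-default data block encoding (DataBlockEncoding.PREFIX) on the family descriptor and then inspects the HFiles the pipeline produces.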

  @Test
  public void testHFileUsesFamilyConfig() throws Exception {
    DataBlockEncoding newBlockEncoding = DataBlockEncoding.PREFIX;
    assertNotSame(newBlockEncoding, DataBlockEncoding.valueOf(HColumnDescriptor.DEFAULT_DATA_BLOCK_ENCODING));

    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");
    HColumnDescriptor hcol = new HColumnDescriptor(TEST_FAMILY);
    hcol.setDataBlockEncoding(newBlockEncoding);
    HTable testTable = createTable(26, hcol);

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String,Long> wordCounts = words.count();
    PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        wordCountPuts,
        testTable,
        outputPath);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    int hfilesCount = 0;
    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    FileSystem fs = outputPath.getFileSystem(conf);
    // ... iterates over the HFiles under outputPath and checks their data block encoding (elided)
  }

Examples of org.apache.crunch.impl.mr.MRPipeline
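Setup code from CogroupIT: one shared MRPipeline reads each of two source files twice, yielding the four PCollections the cogroup tests combine.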

  private Pipeline pipeline;
  private PCollection<String> lines1;
  private PCollection<String> lines2;
  private PCollection<String> lines3;
  private PCollection<String> lines4;

  @Before
  public void setUp() throws IOException {
    pipeline = new MRPipeline(CogroupIT.class, tmpDir.getDefaultConfiguration());
    lines1 = pipeline.readTextFile(tmpDir.copyResourceFileName(Tests.resource(this, "src1.txt")));
    lines2 = pipeline.readTextFile(tmpDir.copyResourceFileName(Tests.resource(this, "src2.txt")));
    lines3 = pipeline.readTextFile(tmpDir.copyResourceFileName(Tests.resource(this, "src1.txt")));
    lines4 = pipeline.readTextFile(tmpDir.copyResourceFileName(Tests.resource(this, "src2.txt")));
  }

Examples of org.apache.crunch.impl.mr.MRPipeline
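From HFileSourceIT (this excerpt starts mid-test): key-values are written to an HFile, read back through FromHBase.hfile, stringified with a MapFn, written out as text, and compared line by line against the originals.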

    List<KeyValue> kvs = generateKeyValues(100);
    Path inputPath = tmpDir.getPath("in");
    Path outputPath = tmpDir.getPath("out");
    writeKeyValuesToHFile(inputPath, kvs);

    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<KeyValue> in = pipeline.read(FromHBase.hfile(inputPath));
    PCollection<String> texts = in.parallelDo(new MapFn<KeyValue, String>() {
      @Override
      public String map(KeyValue input) {
        return input.toString();
      }
    }, strings());
    texts.write(To.textFile(outputPath));
    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    List<String> lines = FileUtils.readLines(new File(outputPath.toString(), "part-m-00000"));
    assertEquals(kvs.size(), lines.size());
    for (int i = 0; i < kvs.size(); i++) {
      // ... compares each KeyValue with the corresponding output line (elided)
    }

Examples of org.apache.crunch.impl.mr.MRPipeline
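A helper from HFileSourceIT that scans a freshly written HFile with HFileUtils.scanHFiles and materializes the resulting Results for the caller to assert on.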

  private List<Result> doTestScanHFiles(List<KeyValue> kvs, Scan scan) throws IOException {
    Path inputPath = tmpDir.getPath("in");
    writeKeyValuesToHFile(inputPath, kvs);

    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<Result> results = HFileUtils.scanHFiles(pipeline, inputPath, scan);
    return ImmutableList.copyOf(results.materialize());
  }

Examples of org.apache.crunch.impl.mr.MRPipeline
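A companion helper that reads the raw KeyValues back through FromHBase.hfile and materializes them.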

  private List<KeyValue> doTestReadHFiles(List<KeyValue> kvs, Scan scan) throws IOException {
    Path inputPath = tmpDir.getPath("in");
    writeKeyValuesToHFile(inputPath, kvs);

    Pipeline pipeline = new MRPipeline(HFileSourceIT.class, conf);
    PCollection<KeyValue> results = pipeline.read(FromHBase.hfile(inputPath));
    return ImmutableList.copyOf(results.materialize());
  }