Package org.apache.crunch

Examples of org.apache.crunch.Pipeline
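
A Pipeline is the entry point of a Crunch job: sources are read into PCollections, transformed with DoFns, written to Targets, and executed with run() or done(). The sketch below shows that basic lifecycle; the WordSplitExample class name, the command-line paths, and the word-splitting DoFn are illustrative assumptions rather than code taken from the examples that follow.

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.From;
import org.apache.crunch.io.To;
import org.apache.crunch.types.writable.Writables;

public class WordSplitExample {
  public static void main(String[] args) {
    // The class argument tells MRPipeline which jar to ship with the job.
    Pipeline pipeline = new MRPipeline(WordSplitExample.class);

    // Read a text source into a PCollection of lines.
    PCollection<String> lines = pipeline.read(From.textFile(args[0]));

    // Split each line into words; the PType describes the output serialization.
    PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {
      @Override
      public void process(String line, Emitter<String> emitter) {
        for (String word : line.split("\\s+")) {
          emitter.emit(word);
        }
      }
    }, Writables.strings());

    // Write to a text target and execute the planned job(s).
    words.write(To.textFile(args[1]));
    pipeline.done();
  }
}

The snippets below exercise the same read/write/run cycle from Crunch's integration tests.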


  @Rule
  public transient TemporaryPath tmpDir = TemporaryPaths.create();
 
  @Test(expected=CrunchRuntimeException.class)
  public void testTextToAvro() throws Exception {
    String shakes = tmpDir.copyResourceFileName("shakes.txt");
    Pipeline pipeline = new MRPipeline(TextToAvroIT.class, tmpDir.getDefaultConfiguration());
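    // The text source yields a Writable-typed PCollection, so writing it
    // directly to an Avro target is expected to fail with a CrunchRuntimeException.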
    pipeline.read(From.textFile(shakes)).write(To.avroFile("output"));
    pipeline.run();
  }


    new File(infilename).getParentFile().mkdirs();

    writeFile(expected, infilename);

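    // Run the input through the in-memory pipeline and read back the first line of its output.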
    Pipeline memPipeline = MemPipeline.getInstance();
    PCollection<String> memPColl = memPipeline.readTextFile(infilename);
    Target memTarget = new TextFileTarget(memOutFilename);
    memPipeline.write(memPColl, memTarget, WriteMode.OVERWRITE);
    memPipeline.run();
    File outDir = new File(memOutFilename);
    File actualMemOut = null;
    for (File f : outDir.listFiles()) {
      String name = f.getName();
      if (name.contains("out") && name.endsWith(".txt")) {
        actualMemOut = f;
        break;
      }
    }
    String actualMemText = Files.readFirstLine(actualMemOut, Charsets.UTF_8);

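    // Run the same input through the MapReduce pipeline and read back its part file.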
    Pipeline mrPipeline = new MRPipeline(getClass());
    PCollection<String> mrPColl = mrPipeline.readTextFile(infilename);
    Target mrTarget = new TextFileTarget(mrOutFilename);
    mrPipeline.write(mrPColl, mrTarget, WriteMode.OVERWRITE);
    mrPipeline.run();
    String actualMrText = Files.readFirstLine(new File(mrOutFilename + "/part-m-00000"), Charsets.UTF_8);

    Assert.assertEquals("MR file mismatch", expected, actualMrText);
    Assert.assertEquals("Mem file mismatch", expected, actualMemText);
  }


  @Test
  public void testMapper() throws Exception {
    Pipeline p = new MRPipeline(MapreduceIT.class, tempDir.getDefaultConfiguration());
    Path shakesPath = tempDir.copyResourcePath("shakes.txt");
    PCollection<String> in = p.read(From.textFile(shakesPath));
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<String, Pair<IntWritable, Text>>() {
      @Override
      public Pair<IntWritable, Text> map(String input) {
        return Pair.of(new IntWritable(input.length()), new Text(input));
      }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
   
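    // Mapreduce.map runs an existing new-API (org.apache.hadoop.mapreduce) Mapper over the table.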
    PTable<IntWritable, Text> out = Mapreduce.map(two, TestMapper.class, IntWritable.class, Text.class);
    out.write(To.sequenceFile(tempDir.getPath("temp")));
    PipelineResult res = p.done();
    assertEquals(1, res.getStageResults().size());
    StageResult sr = res.getStageResults().get(0);
    assertEquals(3667, sr.getCounters().findCounter("written", "out").getValue());
  }

 
  @Test
  public void testReducer() throws Exception {
    Pipeline p = new MRPipeline(MapredIT.class, tempDir.getDefaultConfiguration());
    Path shakesPath = tempDir.copyResourcePath("shakes.txt");
    PCollection<String> in = p.read(From.textFile(shakesPath));
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<String, Pair<IntWritable, Text>>() {
      @Override
      public Pair<IntWritable, Text> map(String input) {
        return Pair.of(new IntWritable(input.length()), new Text(input));
      }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
   
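    // Mapreduce.reduce applies a new-API Reducer to the grouped table.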
    PTable<Text, LongWritable> out = Mapreduce.reduce(two.groupByKey(), TestReducer.class, Text.class, LongWritable.class);
    out.write(To.sequenceFile(tempDir.getPath("temp")));
    PipelineResult res = p.done();
    assertEquals(1, res.getStageResults().size());
    StageResult sr = res.getStageResults().get(0);
    assertEquals(19, sr.getCounters().findCounter("words", "where").getValue());
  }


  @Test
  public void testMemPipelineFileWriter() throws Exception {
    File outputDir = baseTmpDir.getFile("mempipe");
    Pipeline p = MemPipeline.getInstance();
    PCollection<String> lines = MemPipeline.collectionOf("hello", "world");
    p.writeTextFile(lines, outputDir.toString());
    p.done();
    File outputFile = getOutputFile(outputDir, "*.txt");

    List<String> txt = Files.readLines(outputFile, Charsets.UTF_8);
    assertEquals(ImmutableList.of("hello", "world"), txt);
  }

 
  @Test
  public void testMapper() throws Exception {
    Pipeline p = new MRPipeline(MapredIT.class, tempDir.getDefaultConfiguration());
    Path shakesPath = tempDir.copyResourcePath("shakes.txt");
    PCollection<String> in = p.read(From.textFile(shakesPath));
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<String, Pair<IntWritable, Text>>() {
      @Override
      public Pair<IntWritable, Text> map(String input) {
        return Pair.of(new IntWritable(input.length()), new Text(input));
      }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
   
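    // Mapred.map runs an existing old-API (org.apache.hadoop.mapred) Mapper over the table.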
    PTable<Text, LongWritable> out = Mapred.map(two, TestMapper.class, Text.class, LongWritable.class);
    out.write(To.sequenceFile(tempDir.getPath("temp")));
    PipelineResult res = p.done();
    assertEquals(1, res.getStageResults().size());
    StageResult sr = res.getStageResults().get(0);
    assertEquals(3667, sr.getCounters().findCounter("written", "out").getValue());
  }

 
  @Test
  public void testReducer() throws Exception {
    Pipeline p = new MRPipeline(MapredIT.class, tempDir.getDefaultConfiguration());
    Path shakesPath = tempDir.copyResourcePath("shakes.txt");
    PCollection<String> in = p.read(From.textFile(shakesPath));
    PTable<IntWritable, Text> two = in.parallelDo(new MapFn<String, Pair<IntWritable, Text>>() {
      @Override
      public Pair<IntWritable, Text> map(String input) {
        return Pair.of(new IntWritable(input.length()), new Text(input));
      }
    }, Writables.tableOf(Writables.writables(IntWritable.class), Writables.writables(Text.class)));
   
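    // Mapred.reduce applies an old-API Reducer to the grouped table.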
    PTable<Text, LongWritable> out = Mapred.reduce(two.groupByKey(), TestReducer.class, Text.class, LongWritable.class);
    out.write(To.sequenceFile(tempDir.getPath("temp")));
    PipelineResult res = p.done();
    assertEquals(1, res.getStageResults().size());
    StageResult sr = res.getStageResults().get(0);
    assertEquals(108, sr.getCounters().findCounter("thou", "count").getValue());
  }

public class CountersTest implements Serializable {

  @Test
  public void counterTest() throws Exception {
    Pipeline pipeline = MemPipeline.getInstance();

    // A single-element PCollection.
    PCollection<String> objects = MemPipeline.collectionOf(Arrays.asList(new String[]{"hello world"}));
    System.out.println("Objects: " + ((MemCollection) objects).getCollection());

    // Map function that increments 200 counters in the "testCounter" group for each input.
    PCollection<String> objects2 = objects.parallelDo("Create counters",
        new MapFn<String, String>() {
          @Override
          public String map(String input) {
            for(int i = 0; i < 200; ++i) {
              this.increment("testCounter", String.valueOf(i));
            }
            return input;
          }
        },
        Writables.strings()
    );

    // Run it!
    pipeline.done();
    System.out.println("Objects2: " + ((MemCollection) objects2).getCollection());
  }

  @Test
  public void testVanillaCSV() throws Exception {
    final String[] expectedFileContents = { "1,2,3,4", "5,6,7,8", "9,10,11", "12,13,14" };

    final String vanillaCSVFile = tmpDir.copyResourceFileName("vanilla.csv");
    final Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    final PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(vanillaCSVFile)));

    final Collection<String> csvLinesList = csvLines.asCollection().getValue();

    for (int i = 0; i < expectedFileContents.length; i++) {
      assertTrue(csvLinesList.contains(expectedFileContents[i]));
    }
  }

  @Test
  public void testShard() throws Exception {
    File inDir = tempDir.getFile("in");
    FileUtils.writeLines(new File(inDir, "part1"), ImmutableList.of("part1", "part1"));
    FileUtils.writeLines(new File(inDir, "part2"), ImmutableList.of("part2"));
    Pipeline pipeline = new MRPipeline(ShardIT.class);
    PCollection<String> in = pipeline.read(From.textFile(inDir.getPath()));
    // We can only test on 1 shard here, as local MR does not support multiple reducers.
    PCollection<String> out = Shard.shard(in, 1);
    assertEquals(
        ImmutableMultiset.copyOf(out.materialize()),
        ImmutableMultiset.of("part1", "part1", "part2"));
  }


