Package org.apache.crunch

Examples of org.apache.crunch.Pipeline


      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Combiner used for summing up response sizes
    CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();

    // Table of (ip, sum(response size))
    PTable<String, Long> ipAddrResponseSize = lines
        .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs())).groupByKey()
        .combineValues(longSumCombiner);

    pipeline.writeTextFile(ipAddrResponseSize, args[1]);
    // Execute the pipeline as a MapReduce job.
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
  }
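The extractIPResponseSize function passed to parallelDo above is defined outside this excerpt. A minimal sketch of what such a DoFn could look like, assuming space-delimited access-log lines with the client IP in the first field and the response size in the last (both assumptions, not part of the original):

    // Hypothetical sketch; the real extractIPResponseSize is not shown above.
    DoFn<String, Pair<String, Long>> extractIPResponseSize = new DoFn<String, Pair<String, Long>>() {
      @Override
      public void process(String line, Emitter<Pair<String, Long>> emitter) {
        String[] fields = line.split(" ");
        if (fields.length < 2) {
          return; // a line this short cannot hold both an IP and a size
        }
        try {
          emitter.emit(Pair.of(fields[0], Long.parseLong(fields[fields.length - 1])));
        } catch (NumberFormatException e) {
          // Skip lines whose size field is not numeric (e.g. "-").
        }
      }
    };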


    createTable(configuration, TABLE_TARGET, Bytes.toString(COLUMN_FAMILY_TARGET));

    putInHbase(putList, configuration);

    // We create the pipeline that will handle most of the job.
    Pipeline pipeline = new MRPipeline(WordAggregationHBase.class, HBaseConfiguration.create());

    // The scan that will retrieve the data from the source table in HBase.
    Scan scan = new Scan();
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY);
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE);

    // Our HBase source
    HBaseSourceTarget source = new HBaseSourceTarget(TABLE_SOURCE, scan);

    // Our source, in a format that can be used by Crunch
    PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);

    // We process the data from the source HTable, then concatenate all values
    // that share the same rowkey
    PTable<String, String> textExtracted = extractText(rawText);
    CombineFn<String, String> stringConcatCombine = CombineFn.STRING_CONCAT(" ", true);
    PTable<String, String> result = textExtracted.groupByKey().combineValues(stringConcatCombine);

    // We create the collection of Puts from the concatenated data
    PCollection<Put> resultPut = createPut(result);

    // We write the Puts to the target table in HBase
    pipeline.write(resultPut, new HBaseTarget(TABLE_TARGET));

    pipeline.done();
    return 0;
  }
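The createPut helper is likewise outside the excerpt. A plausible sketch, assuming an HBase version where Put implements Writable (as the Writables.writables PType requires) and a hypothetical COLUMN_QUALIFIER_TARGET_TEXT constant for the target qualifier:

    // Sketch only; the real createPut body is not part of the excerpt.
    public PCollection<Put> createPut(PTable<String, String> concatenated) {
      return concatenated.parallelDo("Convert to puts", new MapFn<Pair<String, String>, Put>() {
        @Override
        public Put map(Pair<String, String> input) {
          Put put = new Put(Bytes.toBytes(input.first()));
          // COLUMN_QUALIFIER_TARGET_TEXT is an assumed constant name.
          put.add(COLUMN_FAMILY_TARGET, COLUMN_QUALIFIER_TARGET_TEXT, Bytes.toBytes(input.second()));
          return put;
        }
      }, Writables.writables(Put.class));
    }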

        new ColumnOrder[] { by(1, ASCENDING), by(2, DESCENDING) }, new String[] { "A", "this doc has this text" });
  }

  @Test
  public void testAvroReflectSortPair() throws IOException {
    Pipeline pipeline = new MRPipeline(SortIT.class, tmpDir.getDefaultConfiguration());
    pipeline.enableDebug();
    String rsrc = tmpDir.copyResourceFileName("set2.txt");
    PCollection<Pair<String, StringWrapper>> in = pipeline.readTextFile(rsrc)
        .parallelDo(new MapFn<String, Pair<String, StringWrapper>>() {

          @Override
          public Pair<String, StringWrapper> map(String input) {
            return Pair.of(input, wrap(input));
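This excerpt cuts off inside the MapFn. A sketch of a likely continuation, closing the parallelDo with an Avro reflect-based PType and sorting by the first column (the exact PType and column order are assumptions):

          }
        }, Avros.pairs(Avros.strings(), Avros.reflects(StringWrapper.class)));

    // Sort by the first column; the materialize-based assertion shown at the
    // top of the next excerpt would then trigger execution.
    PCollection<Pair<String, StringWrapper>> sorted = Sort.sortPairs(in, by(1, ASCENDING));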

    assertEquals(expected, Lists.newArrayList(sorted.materialize()));
  }

  @Test
  public void testAvroReflectSortTable() throws IOException {
    Pipeline pipeline = new MRPipeline(SortIT.class, tmpDir.getDefaultConfiguration());
    PTable<String, StringWrapper> unsorted = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt")).parallelDo(
        new MapFn<String, Pair<String, StringWrapper>>() {

          @Override
          public Pair<String, StringWrapper> map(String input) {
            return Pair.of(input, wrap(input));
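The same truncation happens here; once the MapFn is closed with a table PType, the table can be sorted by key, e.g. (a sketch, assuming Avro reflect types as above):

          }
        }, Avros.tableOf(Avros.strings(), Avros.reflects(StringWrapper.class)));

    // Sort.sort on a PTable orders its entries by key.
    PTable<String, StringWrapper> sorted = Sort.sort(unsorted);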

  public void testSecondarySortWritables() throws Exception {
    runSecondarySort(WritableTypeFamily.getInstance());
  }

  public void runSecondarySort(PTypeFamily ptf) throws Exception {
    Pipeline p = new MRPipeline(SecondarySortIT.class, tempDir.getDefaultConfiguration());
    String inputFile = tempDir.copyResourceFileName("secondary_sort_input.txt");
   
    PTable<String, Pair<Integer, Integer>> in = p.read(From.textFile(inputFile))
        .parallelDo(new MapFn<String, Pair<String, Pair<Integer, Integer>>>() {
          @Override
          public Pair<String, Pair<Integer, Integer>> map(String input) {
            String[] pieces = input.split(",");
            return Pair.of(pieces[0],
                Pair.of(Integer.valueOf(pieces[1].trim()), Integer.valueOf(pieces[2].trim())));
          }
        }, ptf.tableOf(ptf.strings(), ptf.pairs(ptf.ints(), ptf.ints())));
    Iterable<String> lines = SecondarySort.sortAndApply(in, new MapFn<Pair<String, Iterable<Pair<Integer, Integer>>>, String>() {
      @Override
      public String map(Pair<String, Iterable<Pair<Integer, Integer>>> input) {
        Joiner j = Joiner.on(',');
        return j.join(input.first(), j.join(input.second()));
      }
    }, ptf.strings()).materialize();
    assertEquals(ImmutableList.of("one,[-5,10],[1,1],[2,-3]", "three,[0,-1]", "two,[1,7],[2,6],[4,5]"),
        ImmutableList.copyOf(lines));
    p.done();
  }
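The contents of secondary_sort_input.txt are not shown, but the parser (one key,int,int triple per line) and the assertion imply input equivalent to the following, in any order; SecondarySort.sortAndApply then hands each key's values to the MapFn ordered by their first (then second) component:

    one,1,1
    one,2,-3
    one,-5,10
    three,0,-1
    two,4,5
    two,2,6
    two,1,7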

import static org.junit.Assert.assertFalse;

public class AvroPathPerKeyIT extends CrunchTestSupport implements Serializable {
  @Test
  public void testOutputFilePerKey() throws Exception {
    Pipeline p = new MRPipeline(AvroPathPerKeyIT.class, tempDir.getDefaultConfiguration());
    Path outDir = tempDir.getPath("out");
    p.read(From.textFile(tempDir.copyResourceFileName("docs.txt")))
        .parallelDo(new MapFn<String, Pair<String, String>>() {
          @Override
          public Pair<String, String> map(String input) {
            String[] p = input.split("\t");
            return Pair.of(p[0], p[1]);
          }
        }, Avros.tableOf(Avros.strings(), Avros.strings()))
        .groupByKey()
        .write(new AvroPathPerKeyTarget(outDir));
    p.done();

    Set<String> names = Sets.newHashSet();
    FileSystem fs = outDir.getFileSystem(tempDir.getDefaultConfiguration());
    for (FileStatus fstat : fs.listStatus(outDir)) {
      names.add(fstat.getPath().getName());
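AvroPathPerKeyTarget writes a separate subdirectory under outDir for each distinct key, which is what the listing loop above collects into names. Based on the assertion in the next excerpt, the layout looks roughly like this (key names depend on docs.txt):

    out/
      <key1>/part-r-00000.avro
      <key2>/part-r-00000.avro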

    assertEquals("part-r-00000.avro", bStat[0].getPath().getName());
  }

  @Test
  public void testOutputFilePerKey_NothingToOutput() throws Exception {
    Pipeline p = new MRPipeline(AvroPathPerKeyIT.class, tempDir.getDefaultConfiguration());
    Path outDir = tempDir.getPath("out");

    p.read(From.textFile(tempDir.copyResourceFileName("docs.txt")))
        .parallelDo(new MapFn<String, Pair<String, String>>() {
          @Override
          public Pair<String, String> map(String input) {
            String[] p = input.split("\t");
            return Pair.of(p[0], p[1]);
          }
        }, Avros.tableOf(Avros.strings(), Avros.strings()))
        .filter(FilterFns.<Pair<String, String>>REJECT_ALL())
        .groupByKey()
        .write(new AvroPathPerKeyTarget(outDir));
    p.done();

    FileSystem fs = outDir.getFileSystem(tempDir.getDefaultConfiguration());
    assertFalse(fs.exists(outDir));
  }
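Since FilterFns.REJECT_ALL() drops every record before the groupByKey, nothing reaches the target; the final assertion verifies that the output directory is never created at all, rather than being left behind empty.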

  @Test
  public void testInputFromMapReduceKeyValueFile_Generic() throws InterruptedException, IOException, ClassNotFoundException {

    Path keyValuePath = produceMapReduceOutputFile();

    Pipeline pipeline = new MRPipeline(AvroKeyValueIT.class, tempDir.getDefaultConfiguration());
    PTable<Person, Integer> personTable = pipeline.read(
        From.avroTableFile(keyValuePath, Avros.tableOf(Avros.specifics(Person.class), Avros.ints())));

    org.apache.crunch.Pair<Person, Integer> firstEntry = Iterables.getFirst(personTable.materialize(), null);

    assertEquals("a", firstEntry.first().getName().toString());
    assertEquals(Integer.valueOf(1), firstEntry.second());

    pipeline.done();

  }
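The produceMapReduceOutputFile() helper is not shown. Judging by its name, and by the produceMapRedOutputFile() counterpart used in the following tests, it writes an Avro key-value file using the newer org.apache.hadoop.mapreduce output format, which From.avroTableFile then reads back into a PTable.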

  @Test
  public void testInputFromMapRedKeyValueFile_Specific() throws IOException {
    Path keyValuePath = produceMapRedOutputFile();

    Pipeline pipeline = new MRPipeline(AvroKeyValueIT.class, tempDir.getDefaultConfiguration());
    PTable<Person, Integer> personTable = pipeline.read(
        From.avroTableFile(keyValuePath, Avros.keyValueTableOf(Avros.specifics(Person.class), Avros.ints())));

    org.apache.crunch.Pair<Person, Integer> firstEntry = Iterables.getFirst(personTable.materialize(), null);

    assertEquals("a", firstEntry.first().getName().toString());
    assertEquals(Integer.valueOf(1), firstEntry.second());

    // Verify that deep copying on this PType works as well
    PTableType<Person, Integer> tableType = Avros.keyValueTableOf(Avros.specifics(Person.class), Avros.ints());
    tableType.initialize(tempDir.getDefaultConfiguration());
    org.apache.crunch.Pair<Person, Integer> detachedPair = tableType.getDetachedValue(firstEntry);
    assertEquals(firstEntry, detachedPair);

    pipeline.done();
  }
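The getDetachedValue check matters because Crunch may reuse the underlying objects while iterating over materialized results; a detached value is a deep copy that stays valid after iteration moves on, and the assertion confirms the copy still equals the original entry.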

  @Test
  public void testInputFromMapRedKeyValueFile_Reflect() throws IOException {
    Path keyValuePath = produceMapRedOutputFile();

    Pipeline pipeline = new MRPipeline(AvroKeyValueIT.class, tempDir.getDefaultConfiguration());
    PTable<ReflectedPerson, Integer> personTable = pipeline.read(
        From.avroTableFile(keyValuePath, Avros.keyValueTableOf(Avros.reflects(ReflectedPerson.class), Avros.ints())));

    org.apache.crunch.Pair<ReflectedPerson, Integer> firstEntry = Iterables.getFirst(personTable.materialize(), null);

    assertEquals("a", firstEntry.first().getName().toString());
    assertEquals(Integer.valueOf(1), firstEntry.second());

    // Verify that deep copying on this PType works as well
    PTableType<ReflectedPerson, Integer> tableType =
        Avros.keyValueTableOf(Avros.reflects(ReflectedPerson.class), Avros.ints());
    tableType.initialize(tempDir.getDefaultConfiguration());
    org.apache.crunch.Pair<ReflectedPerson, Integer> detachedPair = tableType.getDetachedValue(firstEntry);
    assertEquals(firstEntry, detachedPair);

    pipeline.done();
  }
