Examples of org.apache.crunch.Pipeline

org.apache.crunch.Pipeline
Manages the state of a pipeline execution.

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);


    Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
    PCollection<GenericData.Record> genericCollection = pipeline.read(From.avroFile(
        new Path(avroFile.getAbsolutePath()),
        tmpDir.getDefaultConfiguration()));


    List<GenericData.Record> personList = Lists.newArrayList(genericCollection.materialize());

View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), genericPersonSchema);


    Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Record> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
        Avros.generics(genericPersonSchema)));


    List<Record> recordList = Lists.newArrayList(genericCollection.materialize());


    assertEquals(Lists.newArrayList(savedRecord), Lists.newArrayList(recordList));

View Full Code Here

    Schema pojoPersonSchema = ReflectData.get().getSchema(StringWrapper.class);
    GenericRecord savedRecord = new GenericData.Record(pojoPersonSchema);
    savedRecord.put("value", "stringvalue");
    populateGenericFile(Lists.newArrayList(savedRecord), pojoPersonSchema);


    Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
    PCollection<StringWrapper> stringValueCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
        Avros.reflects(StringWrapper.class)));


    List<StringWrapper> recordList = Lists.newArrayList(stringValueCollection.materialize());


    assertEquals(1, recordList.size());

View Full Code Here

    Configuration configuration = tmpDir.getDefaultConfiguration();


    // small io.sort.mb to make the test run faster with less resources
    configuration.set("io.sort.mb", "1");


    Pipeline pipeline = new MRPipeline(SafeAvroSerializationIT.class,
        configuration);


    Schema schema = new Schema.Parser().parse(tmpDir
        .copyResourceFile("CRUNCH-316.avsc"));


    PTable<String, GenericData.Record> leftSide = pipeline.read(
        At.avroFile(
            new Path(populateLeftSide(schema).getAbsolutePath()),
            Avros.generics(schema))).by(
        new MapFn<GenericData.Record, String>() {
          @Override
          public String map(GenericData.Record input) {
            return (String) input.get("tag").toString();
          }
        }, Avros.strings());


    PTable<String, String> rightSide = pipeline.read(
        At.avroFile(new Path(populateRightSide().getAbsolutePath()),
            Avros.strings())).by(new MapFn<String, String>() {
      @Override
      public String map(String input) {
        return input;

View Full Code Here

  @Test
  public void testGenericReflectConflict() throws IOException {
    final Random rand = new Random();
    rand.setSeed(12345);
    Configuration conf = new Configuration();
    Pipeline pipeline = new MRPipeline(AvroModeIT.class, conf);
    Source<GenericData.Record> source = From.avroFile(
        tmpDir.copyResourceFileName("strings-100.avro"),
        Avros.generics(GENERIC_SCHEMA));
    PTable<Long, float[]> mapPhase = pipeline
        .read(source)
        .parallelDo(new DoFn<GenericData.Record, Pair<Long, float[]>>() {
          @Override
          public void process(GenericData.Record input, Emitter<Pair<Long, float[]>> emitter) {
            emitter.emit(Pair.of(
                Long.valueOf(input.get("text").toString().length()),
                new float[] {rand.nextFloat(), rand.nextFloat()}));
          }
        }, Avros.tableOf(Avros.longs(), FLOAT_ARRAY));


    PTable<Long, float[]> result = mapPhase
        .groupByKey()
        .combineValues(new Aggregator<float[]>() {
          float[] accumulator = null;


          @Override
          public Iterable<float[]> results() {
            return ImmutableList.of(accumulator);
          }


          @Override
          public void initialize(Configuration conf) {
          }


          @Override
          public void reset() {
            this.accumulator = null;
          }


          @Override
          public void update(float[] value) {
            if (accumulator == null) {
              accumulator = Arrays.copyOf(value, 2);
            } else {
              for (int i = 0; i < value.length; i += 1) {
                accumulator[i] += value[i];
              }
            }
          }
        });


    pipeline.writeTextFile(result, tmpDir.getFileName("unused"));
    Assert.assertTrue("Should succeed", pipeline.done().succeeded());
  }

View Full Code Here

  @Test
  public void testVanillaCSVWithAdditionalActions() throws Exception {
    final String[] expectedFileContents = { "1,2,3,4", "5,6,7,8", "9,10,11", "12,13,14" };


    final String vanillaCSVFile = tmpDir.copyResourceFileName("vanilla.csv");
    final Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    final PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(vanillaCSVFile)));


    final PTable<String, Long> countTable = csvLines.count();
    final PCollection<String> csvLines2 = countTable.keys();
    final Collection<String> csvLinesList = csvLines2.asCollection().getValue();

View Full Code Here

    final String[] expectedFileContents = {
        "\"Champion, Mac\",\"1234 Hoth St.\n\tApartment 101\n\tAtlanta, GA\n\t64086\",\"30\",\"M\",\"5/28/2010 12:00:00 AM\",\"Just some guy\"",
        "\"Champion, Mac\",\"5678 Tatooine Rd. Apt 5, Mobile, AL 36608\",\"30\",\"M\",\"Some other date\",\"short description\"" };


    final String csvWithNewlines = tmpDir.copyResourceFileName("withNewlines.csv");
    final Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    final PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(csvWithNewlines)));


    final Collection<String> csvLinesList = csvLines.asCollection().getValue();


    for (int i = 0; i < expectedFileContents.length; i++) {
      assertTrue(csvLinesList.contains(expectedFileContents[i]));

View Full Code Here

    final String[] expectedFileContents = {
        "*Champion, Mac*,*1234 Hoth St.\n\tApartment 101\n\tAtlanta, GA\n\t64086*,*30*,*M*,*5/28/2010 12:00:00 AM*,*Just some guy*",
        "*Mac, Champion*,*5678 Tatooine Rd. Apt 5, Mobile, AL 36608*,*30*,*M*,*Some other date*,*short description*" };


    final String csvWithNewlines = tmpDir.copyResourceFileName("customQuoteCharWithNewlines.csv");
    final Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    final PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(csvWithNewlines),
        CSVLineReader.DEFAULT_BUFFER_SIZE, CSVLineReader.DEFAULT_INPUT_FILE_ENCODING, '*', '*',
        CSVLineReader.DEFAULT_ESCAPE_CHARACTER, CSVLineReader.DEFAULT_MAXIMUM_RECORD_SIZE));


    final Collection<String> csvLinesList = csvLines.asCollection().getValue();

View Full Code Here

  public void testBrokenLineParsingInChinese() throws IOException {
    final String[] expectedChineseLines = { "您好我叫马克，我从亚拉巴马州来，我是软件工程师，我二十八岁", "我有一个宠物，它是一个小猫，它六岁，它很漂亮",
        "我喜欢吃饭，“我觉得这个饭最好\n＊蛋糕\n＊包子\n＊冰淇淋\n＊啤酒“，他们都很好，我也很喜欢奶酪但它是不健康的", "我是男的，我的头发很短，我穿蓝色的裤子，“我穿黑色的、“衣服”" };
    final String chineseLines = tmpDir.copyResourceFileName("brokenChineseLines.csv");


    final Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    final PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(chineseLines),
        CSVLineReader.DEFAULT_BUFFER_SIZE, CSVLineReader.DEFAULT_INPUT_FILE_ENCODING, '“', '”', '、',
        CSVLineReader.DEFAULT_MAXIMUM_RECORD_SIZE));
    final Collection<String> csvLinesList = csvLines.asCollection().getValue();
    for (int i = 0; i < expectedChineseLines.length; i++) {
      assertTrue(csvLinesList.contains(expectedChineseLines[i]));

View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);


    Pipeline pipeline = new MRPipeline(AvroParquetPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
        Avros.records(Person.class)));
    File outputFile = tmpDir.getFile("output");
    Target parquetFileTarget = new AvroParquetFileTarget(outputFile.getAbsolutePath());
    pipeline.write(genericCollection, parquetFileTarget);
    pipeline.run();


    Person person = genericCollection.materialize().iterator().next();


    Path parquetFile = new Path(new File(outputFile, "part-m-00000.parquet").getPath());

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.crunch.Pipeline

$.WordCount

com.cloudera.cdk.data.crunch.TestCrunchDatasets

org.apache.bigtop.bigpetstore.etl.CrunchETL

org.apache.crunch.contrib.io.jdbc.DataBaseSourceIT

org.apache.crunch.examples.AverageBytesByIP

org.apache.crunch.examples.SecondarySortExample

org.apache.crunch.examples.TotalBytesByIP

org.apache.crunch.examples.TotalWordCount

org.apache.crunch.examples.WordAggregationHBase

org.apache.crunch.examples.WordCount

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.