Package org.apache.crunch

Examples of org.apache.crunch.Pipeline


    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericParquetFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);

    Pipeline pipeline = new MRPipeline(AvroParquetPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(
        new AvroParquetFileSource<Person>(new Path(avroFile.getAbsolutePath()), Avros.records(Person.class)));
    File outputFile = tmpDir.getFile("output");
    Target parquetFileTarget = new AvroParquetFileTarget(outputFile.getAbsolutePath());
    pipeline.write(genericCollection, parquetFileTarget);
    pipeline.run();

    Person person = genericCollection.materialize().iterator().next();

    Path parquetFile = new Path(new File(outputFile, "part-m-00000.parquet").getPath());
View Full Code Here


    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);

    Pipeline pipeline = new MRPipeline(AvroParquetPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
        Avros.records(Person.class)));

    PCollection<Employee> employees = genericCollection.parallelDo(new DoFn<Person, Employee>() {
      @Override
      public void process(Person person, Emitter<Employee> emitter) {
        emitter.emit(new Employee(person.getName(), 0, "Eng"));
      }
    }, Avros.records(Employee.class));

    File output1File = tmpDir.getFile("output1");
    File output2File = tmpDir.getFile("output2");
    pipeline.write(genericCollection, new AvroParquetFileTarget(output1File.getAbsolutePath()));
    pipeline.write(employees, new AvroParquetFileSourceTarget(new Path(output2File.getAbsolutePath()),
        Avros.records(Employee.class)));
    pipeline.run();

    Person person = genericCollection.materialize().iterator().next();
    Employee employee = employees.materialize().iterator().next();

    Path parquet1File = new Path(new File(output1File, "part-m-00000.parquet").getPath());
View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);

    Pipeline pipeline = new MRPipeline(AvroParquetPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
        Avros.records(Person.class)));
    File outputFile = tmpDir.getFile("output");
    Target parquetFileTarget = new AvroParquetFileTarget(outputFile.getAbsolutePath());
    pipeline.write(genericCollection, parquetFileTarget);
    pipeline.run();

    Person person = genericCollection.materialize().iterator().next();

    PCollection<Person> retrievedPeople = pipeline.read(new AvroParquetFileSource<Person>(
        new Path(outputFile.toURI()), Avros.records(Person.class)));

    Person retrievedPerson = retrievedPeople.materialize().iterator().next();

    assertThat(retrievedPerson, is(person));
View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);

    Pipeline pipeline = new MRPipeline(AvroParquetFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(new AvroParquetFileSource<Person>(new Path(avroFile.getAbsolutePath()),
        Avros.records(Person.class)));

    List<Person> personList = Lists.newArrayList(genericCollection.materialize());

    Person expectedPerson = new Person();
View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), genericPersonSchema);

    Pipeline pipeline = new MRPipeline(AvroParquetFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Record> genericCollection = pipeline.read(new AvroParquetFileSource<Record>(new Path
        (avroFile.getAbsolutePath()),
        Avros.generics(genericPersonSchema)));

    List<Record> recordList = Lists.newArrayList(genericCollection.materialize());
View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);

    Pipeline pipeline = new MRPipeline(AvroParquetFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(
        AvroParquetFileSource.builder(Person.class)
            .includeField("age")
            .build(new Path(avroFile.getAbsolutePath())));

    File outputFile = tmpDir.getFile("output");
    Target avroFile = To.avroFile(outputFile.getAbsolutePath());
    genericCollection.write(avroFile);
    pipeline.done();
   
    Pipeline pipeline2 = new MRPipeline(AvroParquetFileSourceTargetIT.class,
        tmpDir.getDefaultConfiguration());
    PCollection<Person> ageOnly = pipeline2.read(
        new AvroFileSource<Person>(new Path(outputFile.getAbsolutePath()), Avros.specifics(Person.class)));

    for (Person person : ageOnly.materialize()) {
      assertNull(person.getName());
      assertEquals(person.getAge(), new Integer(42));
View Full Code Here

    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);

    AvroParquetFileSource<GenericRecord> src = AvroParquetFileSource.builder(Person.SCHEMA$)
        .includeField("age")
        .build(new Path(avroFile.getAbsolutePath()));
    Pipeline pipeline = new MRPipeline(AvroParquetFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
    PCollection<GenericRecord> genericCollection = pipeline.read(src);

    File outputFile = tmpDir.getFile("output");
    Target avroFile = To.avroFile(outputFile.getAbsolutePath());
    genericCollection.write(avroFile);
    pipeline.done();

    Pipeline pipeline2 = new MRPipeline(AvroParquetFileSourceTargetIT.class,
        tmpDir.getDefaultConfiguration());
    PCollection<Record> ageOnly = pipeline2.read(
        new AvroFileSource<Record>(new Path(outputFile.getAbsolutePath()), Avros.generics(src.getProjectedSchema())));

    for (Record person : ageOnly.materialize()) {
      assertEquals(person.get(0), 42);
      Object notAge = person.get(1);
View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);

    Pipeline pipeline = new MRPipeline(AvroParquetFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(
        AvroParquetFileSource.builder(Person.class)
            .includeField("age")
            .filterClass(RejectAllFilter.class)
            .build(new Path(avroFile.getAbsolutePath())));

    File outputFile = tmpDir.getFile("output");
    Target avroFile = To.avroFile(outputFile.getAbsolutePath());
    genericCollection.filter(new FilterFn<Person>() {
      @Override
      public boolean accept(Person input) {
        return input != null;
      }
    }).write(avroFile);
    pipeline.done();

    Pipeline pipeline2 = new MRPipeline(AvroParquetFileSourceTargetIT.class,
        tmpDir.getDefaultConfiguration());
    PCollection<Person> ageOnly = pipeline2.read(
        new AvroFileSource<Person>(new Path(outputFile.getAbsolutePath()), Avros.specifics(Person.class)));
    assertTrue(Lists.newArrayList(ageOnly.materialize()).isEmpty());
  }
View Full Code Here

  }

  @Test
  public void testMemPipelineFileWriter() throws Exception {
    File tmpDir = baseTmpDir.getFile("mempipe");
    Pipeline p = MemPipeline.getInstance();
    PCollection<String> lines = MemPipeline.collectionOf("hello", "world");
    p.writeTextFile(lines, tmpDir.toString());
    p.done();
    assertTrue(tmpDir.exists());
    File[] files = tmpDir.listFiles();
    assertTrue(files != null && files.length > 0);
    for (File f : files) {
      if (!f.getName().startsWith(".")) {
View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);

    Pipeline pipeline = new MRPipeline(AvroParquetFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(new AvroParquetFileSource<Person>(new Path(avroFile.getAbsolutePath()),
        Avros.records(Person.class)));

    List<Person> personList = Lists.newArrayList(genericCollection.materialize());

    Person expectedPerson = new Person();
View Full Code Here

TOP

Related Classes of org.apache.crunch.Pipeline

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.