Examples of org.apache.crunch.Pipeline

org.apache.crunch.Pipeline
Manages the state of a pipeline execution.

  @Test
  public void testVanillaCSV() throws Exception {
    String[] expectedFileContents = { "1,2,3,4", "5,6,7,8", "9,10,11", "12,13,14" };


    String vanillaCSVFile = tmpDir.copyResourceFileName("vanilla.csv");
    Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(vanillaCSVFile)));
    pipeline.run();


    Collection<String> csvLinesList = csvLines.asCollection().getValue();


    for (int i = 0; i < expectedFileContents.length; i++) {
      assertTrue(csvLinesList.contains(expectedFileContents[i]));

View Full Code Here

    String[] expectedFileContents = {
        "\"Champion, Mac\",\"1234 Hoth St.\n\tApartment 101\n\tAtlanta, GA\n\t64086\",\"30\",\"M\",\"5/28/2010 12:00:00 AM\",\"Just some guy\"",
        "\"Champion, Mac\",\"5678 Tatooine Rd. Apt 5, Mobile, AL 36608\",\"30\",\"M\",\"Some other date\",\"short description\"" };


    String csvWithNewlines = tmpDir.copyResourceFileName("withNewlines.csv");
    Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(csvWithNewlines)));
    pipeline.run();


    Collection<String> csvLinesList = csvLines.asCollection().getValue();


    for (int i = 0; i < expectedFileContents.length; i++) {
      assertTrue(csvLinesList.contains(expectedFileContents[i]));

View Full Code Here

    String[] expectedFileContents = {
        "*Champion, Mac*,*1234 Hoth St.\n\tApartment 101\n\tAtlanta, GA\n\t64086*,*30*,*M*,*5/28/2010 12:00:00 AM*,*Just some guy*",
        "*Mac, Champion*,*5678 Tatooine Rd. Apt 5, Mobile, AL 36608*,*30*,*M*,*Some other date*,*short description*" };


    String csvWithNewlines = tmpDir.copyResourceFileName("customQuoteCharWithNewlines.csv");
    Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(csvWithNewlines),
        CSVLineReader.DEFAULT_BUFFER_SIZE, CSVLineReader.DEFAULT_INPUT_FILE_ENCODING, '*', '*',
        CSVLineReader.DEFAULT_ESCAPE_CHARACTER));
    pipeline.run();


    Collection<String> csvLinesList = csvLines.asCollection().getValue();


    for (int i = 0; i < expectedFileContents.length; i++) {
      assertTrue(csvLinesList.contains(expectedFileContents[i]));

View Full Code Here

  public void testBrokenLineParsingInChinese() throws IOException {
    final String[] expectedChineseLines = { "您好我叫马克，我从亚拉巴马州来，我是软件工程师，我二十八岁", "我有一个宠物，它是一个小猫，它六岁，它很漂亮",
        "我喜欢吃饭，“我觉得这个饭最好\n＊蛋糕\n＊包子\n＊冰淇淋\n＊啤酒“，他们都很好，我也很喜欢奶酪但它是不健康的", "我是男的，我的头发很短，我穿蓝色的裤子，“我穿黑色的、“衣服”" };
    String chineseLines = tmpDir.copyResourceFileName("brokenChineseLines.csv");


    Pipeline pipeline = new MRPipeline(CSVFileSourceIT.class, tmpDir.getDefaultConfiguration());
    PCollection<String> csvLines = pipeline.read(new CSVFileSource(new Path(chineseLines),
        CSVLineReader.DEFAULT_BUFFER_SIZE, CSVLineReader.DEFAULT_INPUT_FILE_ENCODING, '“', '”', '、'));
    pipeline.run();
    Collection<String> csvLinesList = csvLines.asCollection().getValue();
    for (int i = 0; i < expectedChineseLines.length; i++) {
      assertTrue(csvLinesList.contains(expectedChineseLines[i]));
    }
  }

View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);


    Pipeline pipeline = new MRPipeline(TrevniKeyPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
        Avros.records(Person.class)));
    File outputFile = tmpDir.getFile("output");
    Target trevniFile = new TrevniKeyTarget(outputFile.getAbsolutePath());
    pipeline.write(genericCollection, trevniFile);
    pipeline.run();


    Person person = genericCollection.materialize().iterator().next();


    PCollection<Person> retrievedPeople = pipeline.read(new TrevniKeySource<Person>(
        new Path(outputFile.toURI()), Avros.records(Person.class)));


    Person retrievedPerson = retrievedPeople.materialize().iterator().next();


    assertThat(retrievedPerson, is(person));

View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);


    Pipeline pipeline = new MRPipeline(TrevniKeyPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
        Avros.records(Person.class)));
    File outputFile = tmpDir.getFile("output");
    Target trevniFile = new TrevniKeyTarget(outputFile.getAbsolutePath());
    pipeline.write(genericCollection, trevniFile);
    pipeline.run();


    Person person = genericCollection.materialize().iterator().next();


    File trvFile = new File(outputFile, "part-m-00000-part-0.trv");

View Full Code Here

    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);


    Pipeline pipeline = new MRPipeline(TrevniKeyPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
        Avros.records(Person.class)));
    File output1File = tmpDir.getFile("output1");
    File output2File = tmpDir.getFile("output2");
    pipeline.write(genericCollection, new TrevniKeyTarget(output1File.getAbsolutePath()));
    pipeline.write(genericCollection, new TrevniKeyTarget(output2File.getAbsolutePath()));
    pipeline.run();


    Person person = genericCollection.materialize().iterator().next();


    File trv1File = new File(output1File, "part-m-00000-part-0.trv");
    File trv2File = new File(output2File, "part-m-00000-part-0.trv");

View Full Code Here

      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);


    // Aggregator used for summing up response size and count
    Aggregator<Pair<Long, Long>> agg = pairAggregator(SUM_LONGS(), SUM_LONGS());


    // Table of (ip, sum(response size), count)
    PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines
        .parallelDo(extractResponseSize,
            Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs()))).groupByKey()
        .combineValues(agg);


    // Calculate average response size by ip address
    PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage,
        Writables.tableOf(Writables.strings(), Writables.doubles()));


    // write the result to a text file
    pipeline.writeTextFile(avgs, args[1]);
    // Execute the pipeline as a MapReduce.
    PipelineResult result = pipeline.done();


    return result.succeeded() ? 0 : 1;
  }

View Full Code Here

      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(TotalWordCount.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);


    // Define a function that splits each line in a PCollection of Strings on
    // whitespace and emits the number of words found on that line, producing
    // a PCollection of per-line word counts.
    PCollection<Long> numberOfWords = lines.parallelDo(new DoFn<String, Long>() {
      public void process(String line, Emitter<Long> emitter) {
        emitter.emit((long)line.split("\\s+").length);
      }
    }, Writables.longs()); // Indicates the serialization format


    // The aggregate method groups a collection into a single PObject.
    PObject<Long> totalCount = numberOfWords.aggregate(Aggregators.SUM_LONGS()).first();


    // Execute the pipeline as a MapReduce.
    PipelineResult result = pipeline.run();


    System.out.println("Total number of words: " + totalCount.getValue());
    
    pipeline.done();


    return result.succeeded() ? 0 : 1;
  }

View Full Code Here

    createTable(configuration, TABLE_TARGET, Bytes.toString(COLUMN_FAMILY_TARGET));


    putInHbase(putList, configuration);


    // We create the pipeline which will handle most of the job.
    Pipeline pipeline = new MRPipeline(WordAggregationHBase.class, HBaseConfiguration.create());


    // The scan which will retrieve the data from the source in hbase.
    Scan scan = new Scan();
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY);
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE);


    // Our hbase source
    HBaseSourceTarget source = new HBaseSourceTarget(TABLE_SOURCE, scan);


    // Our source, in a format which can be used by Crunch
    PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);


    // We process the data from the source HTable then concatenate all data
    // with the same rowkey
    PTable<String, String> textExtracted = extractText(rawText);
    PTable<String, String> result = textExtracted.groupByKey()
        .combineValues(Aggregators.STRING_CONCAT(" ",  true));


    // We create the collection of puts from the concatenated data
    PCollection<Put> resultPut = createPut(result);


    // We write the puts in hbase, in the target table
    pipeline.write(resultPut, new HBaseTarget(TABLE_TARGET));


    pipeline.done();
    return 0;
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.crunch.Pipeline

$.WordCount

com.cloudera.cdk.data.crunch.TestCrunchDatasets

org.apache.bigtop.bigpetstore.etl.CrunchETL

org.apache.crunch.contrib.io.jdbc.DataBaseSourceIT

org.apache.crunch.examples.AverageBytesByIP

org.apache.crunch.examples.SecondarySortExample

org.apache.crunch.examples.TotalBytesByIP

org.apache.crunch.examples.TotalWordCount

org.apache.crunch.examples.WordAggregationHBase

org.apache.crunch.examples.WordCount

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.