Examples of PCollection


Examples of com.tdunning.plume.PCollection

    @Override
    public void build() {
      init();
     
      LazyPlume plume = new LazyPlume();
      PCollection input;
      try {
        // Read input
        input = plume.readFile("/tmp/input-wordcount.txt", collectionOf(strings()));
        // Add it as workflow's input
        addInput(input);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
     
      PCollection bypassTransform = input.map(new DoFn() {
        @Override
        public void process(Object v, EmitFn emitter) {
          Text t = (Text)v;
          emitter.emit(Pair.create(new Text(t + "-blah"), new Text(t + "-bloh")));
        }}, tableOf(strings(), strings()));
     
      addOutput(bypassTransform);
     
      PCollection groupedTransform = input.map(new DoFn() {
        @Override
        public void process(Object v, EmitFn emitter) {
          Text t = (Text)v;
          emitter.emit(Pair.create(t, new Text("foo")));
        }}, tableOf(strings(), strings())).groupByKey();
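The bypass transform above uses a raw DoFn and casts each value to Text by hand; the next example parameterizes DoFn with its input and output types instead. A typed version of the same transform might look like the following sketch (not the project's original code; the exact generic signature expected by Plume's map is assumed here):

      PCollection bypassTransform = input.map(new DoFn<Text, Pair<Text, Text>>() {
        @Override
        public void process(Text t, EmitFn<Pair<Text, Text>> emitter) {
          // Declaring the input type on the DoFn removes the need for the (Text) cast.
          emitter.emit(Pair.create(new Text(t + "-blah"), new Text(t + "-bloh")));
        }
      }, tableOf(strings(), strings()));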

Examples of com.tdunning.plume.PCollection

    public void build() {
      init();

      LazyPlume plume = new LazyPlume();
      // Get input files
      PCollection inputEvent2, inputLogFile, inputLogFile2;
      try {
        inputEvent2   = plume.readFile(inputPathEvent2, collectionOf(strings()));
        inputLogFile2 = plume.readFile(inputPathLogFile2, collectionOf(strings()));
        inputLogFile  = plume.readFile(inputPathLogFile, collectionOf(strings()));
        // Add as inputs
        addInput(inputEvent2);
        addInput(inputLogFile);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }     
     
      /**
       * Emit the user of the log file - flatten it with users file
       */
      PCollection output = plume.flatten(collectionOf(strings()),
        inputEvent2,
        inputLogFile.map(new DoFn<Text, Text>() {
          @Override
          public void process(Text v, EmitFn<Text> emitter) {
            String[] splittedLine = v.toString().split("\t");
            emitter.emit(new Text(splittedLine[2]));
          }
        }, collectionOf(strings())));
     
      /**
       * Flatten two log files
       */
      PCollection output2 = plume.flatten(collectionOf(strings()),
        inputLogFile2,
        inputLogFile);
     
      addOutput(output);
      addOutput(output2);

Examples of org.apache.crunch.PCollection

      throw new IllegalArgumentException("Unsupported --pipeline-type: " + opts.pipelineType);
    }

    this.tmpFile = null;   
    try {
      PCollection collection = extractInputCollection(opts, mappers, pipeline);
      if (collection == null) {
        return 0;
      }

      String morphlineFileContents = Files.toString(opts.morphlineFile, Charsets.UTF_8);
      Map<String, String> morphlineVariables = new HashMap<String, String>();
      for (Map.Entry<String, String> entry : pipeline.getConfiguration()) {
        String variablePrefix = MORPHLINE_VARIABLE_PARAM + ".";
        if (entry.getKey().startsWith(variablePrefix)) {
          morphlineVariables.put(entry.getKey().substring(variablePrefix.length()), entry.getValue());
        }
      }
     
      Map<String, Object> settings = new HashMap<String, Object>();
      settings.put(TypedSettings.DRY_RUN_SETTING_NAME, opts.isDryRun);
     
      DoFn morphlineFn = new MorphlineFn(
          morphlineFileContents,
          opts.morphlineId,
          morphlineVariables,
          settings,
          opts.inputFileFormat != null
          );
      collection = collection.parallelDo(
          "morphline",
          morphlineFn,
          Avros.nulls() // trick to enable morphline to emit any kind of output data, including non-avro data
          );
     
      collection = collection.parallelDo(
          FilterFns.REJECT_ALL(), // aka dropRecord
          Avros.nulls() // trick to enable morphline to emit any kind of output data, including non-avro data
          );
 
      writeOutput(opts, pipeline, collection);

Examples of org.apache.crunch.PCollection

        // If there are few input files, reduce latency by running the randomization in memory
        // instead of launching a high-latency MapReduce job
        randomizeFewInputFiles(tmpFs, tmpFile);
      }
      PCollection collection = pipeline.read(new NLineFileSource<String>(tmpFile, Writables.strings(), numLinesPerSplit));

      if (!randomizeFewInputFiles) {
        collection = randomize(collection); // uses a high latency MapReduce job
      }
      collection = collection.parallelDo(new HeartbeatFn(), collection.getPType());
      return collection;
    }
  }
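For context, a Crunch PCollection is always created by and executed through a Pipeline. The following is a minimal, self-contained sketch of that lifecycle; the driver class name and file paths are illustrative only, not taken from the examples above:

    import org.apache.crunch.DoFn;
    import org.apache.crunch.Emitter;
    import org.apache.crunch.PCollection;
    import org.apache.crunch.Pipeline;
    import org.apache.crunch.impl.mr.MRPipeline;
    import org.apache.crunch.types.writable.Writables;
    import org.apache.hadoop.conf.Configuration;

    public class MinimalPipeline {
      public static void main(String[] args) {
        Pipeline pipeline = new MRPipeline(MinimalPipeline.class, new Configuration());
        // Read a text file into a PCollection of lines.
        PCollection<String> lines = pipeline.readTextFile("/tmp/input.txt");
        // Transform each line with a DoFn, declaring the output PType explicitly.
        PCollection<String> upper = lines.parallelDo(new DoFn<String, String>() {
          @Override
          public void process(String line, Emitter<String> emitter) {
            emitter.emit(line.toUpperCase());
          }
        }, Writables.strings());
        // Write the result and run the pipeline.
        pipeline.writeTextFile(upper, "/tmp/output");
        pipeline.done();
      }
    }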