Package org.apache.crunch

Examples of org.apache.crunch.PCollection


      throw new IllegalArgumentException("Unsupported --pipeline-type: " + opts.pipelineType);
    }

    this.tmpFile = null;   
    try {
      PCollection collection = extractInputCollection(opts, mappers, pipeline);
      if (collection == null) {
        return 0;
      }

      String morphlineFileContents = Files.toString(opts.morphlineFile, Charsets.UTF_8);
      Map<String, String> morphlineVariables = new HashMap<String, String>();
      for (Map.Entry<String, String> entry : pipeline.getConfiguration()) {
        String variablePrefix = MORPHLINE_VARIABLE_PARAM + ".";
        if (entry.getKey().startsWith(variablePrefix)) {
          morphlineVariables.put(entry.getKey().substring(variablePrefix.length()), entry.getValue());
        }
      }
     
      Map<String, Object> settings = new HashMap<String, Object>();
      settings.put(TypedSettings.DRY_RUN_SETTING_NAME, opts.isDryRun);
     
      DoFn morphlineFn = new MorphlineFn(
          morphlineFileContents,
          opts.morphlineId,
          morphlineVariables,
          settings,
          opts.inputFileFormat != null
          );
      collection = collection.parallelDo(
          "morphline",
          morphlineFn,
          Avros.nulls() // trick to enable morphline to emit any kind of output data, including non-avro data
          );
     
      collection = collection.parallelDo(
          FilterFns.REJECT_ALL(), // aka dropRecord
          Avros.nulls() // trick to enable morphline to emit any kind of output data, including non-avro data
          );
 
      writeOutput(opts, pipeline, collection);
View Full Code Here


        // If there are few input files reduce latency by directly running main memory randomization
        // instead of launching a high latency MapReduce job
        randomizeFewInputFiles(tmpFs, tmpFile);
      }
      PCollection collection = pipeline.read(new NLineFileSource<String>(tmpFile, Writables.strings(), numLinesPerSplit));

      if (!randomizeFewInputFiles) {
        collection = randomize(collection); // uses a high latency MapReduce job
      }
      collection = collection.parallelDo(new HeartbeatFn(), collection.getPType());
      return collection;
    }
  }
View Full Code Here

TOP

Related Classes of org.apache.crunch.PCollection

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.