 * @param all
 *          if true, select all categories
 */
public static void runJob(String input, String output, String catFile,
                          boolean exactMatchOnly, boolean all) throws IOException {
  JobClient client = new JobClient();
  JobConf conf = new JobConf(WikipediaToSequenceFile.class);
  if (log.isInfoEnabled()) {
    log.info("Input: " + input + " Out: " + output + " Categories: " + catFile
             + " All Files: " + all);
  }
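  // XmlInputFormat uses these start/end tags to split the dump so that each map
  // input value is one complete <page>...</page> element.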
conf.set("xmlinput.start", "<page>");
conf.set("xmlinput.end", "</page>");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setBoolean("exact.match.only", exactMatchOnly);
conf.setBoolean("all.files", all);
  FileInputFormat.setInputPaths(conf, new Path(input));
  Path outPath = new Path(output);
  FileOutputFormat.setOutputPath(conf, outPath);
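  // WikipediaMapper turns each page into a key/value pair (applying the category
  // filter configured below); IdentityReducer passes those pairs through unchanged.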
  conf.setMapperClass(WikipediaMapper.class);
  conf.setInputFormat(XmlInputFormat.class);
  conf.setReducerClass(IdentityReducer.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
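  // Register JavaSerialization alongside WritableSerialization so DefaultStringifier
  // (used below) can encode the category Set into the job configuration.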
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ "org.apache.hadoop.io.serializer.WritableSerialization");
  /*
   * conf.set("mapred.compress.map.output", "true");
   * conf.set("mapred.map.output.compression.type", "BLOCK");
   * conf.set("mapred.output.compress", "true");
   * conf.set("mapred.output.compression.type", "BLOCK");
   * conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
   */
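  // Remove any existing output directory so the job does not fail on an existing path.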
  FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
  if (dfs.exists(outPath)) {
    dfs.delete(outPath, true);
  }
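  // Load the category list, if a file was given: one category per line, trimmed and
  // lower-cased.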
  Set<String> categories = new HashSet<String>();
  if (catFile.length() > 0) {
    for (String line : new FileLineIterable(new File(catFile))) {
      categories.add(line.trim().toLowerCase());
    }
  }
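  // Stringify the category set into the configuration so mappers can read it back.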
  DefaultStringifier<Set<String>> setStringifier =
      new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
  String categoriesStr = setStringifier.toString(categories);
  conf.set("wikipedia.categories", categoriesStr);
  client.setConf(conf);
  JobClient.runJob(conf);
}