boolean usedthedate,
String custFields,
updateStatus update,
String uniqCheckField,
Integer parallel
) throws Exception {
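// Builds the index with two chained MapReduce jobs: stage 1 scans the raw
// input and writes small per-reducer index segments into the temporary
// `smallindex` directory; stage 2 merges those segments into `shards`
// final partitions under `output`.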
Job job = new Job(new Configuration(jconf));
JobIndexPublic.setJars(job.getConfiguration());
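// Register every input glob of the form <inputBase>/*<input>*/<inputmatch>
// that matches at least one file; "seq" inputs additionally switch the job
// to SequenceFileInputFormat.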
if (filetype.equals("seq")) {
job.setInputFormatClass(SequenceFileInputFormat.class);
}
// addInputPath is the static helper inherited from FileInputFormat, so a
// single loop serves both input formats.
for (String input : inputs) {
Path p = new Path(inputBase, "*" + input + "*/" + inputmatch);
System.out.println(p.toString());
FileStatus[] list = fs.globStatus(p);
if (list == null || list.length == 0) {
// Skip globs that match no files; registering them would make
// input-split computation fail at submission time.
continue;
}
FileInputFormat.addInputPath(job, p);
}
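// Derive a compact job name from the output path, roughly
// "<grandparent>*<leaf>", capped at the trailing 50 characters.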
Path baseP = new Path(output);
Path baseParent = baseP;
if (baseP.getParent() != null && baseP.getParent().getParent() != null) {
baseParent = baseP.getParent().getParent();
}
String jobnameOutput = baseParent.toString() + "*" + baseP.getName();
int cutoutlen = 50;
if (jobnameOutput.length() > cutoutlen) {
jobnameOutput = "*" + jobnameOutput.substring(jobnameOutput.length() - cutoutlen);
}
System.out.println("output:"+output+"@"+jobnameOutput);
System.out.println("tmp:"+smallindex.toString());
job.setJobName("mdrill_stage_1@"+jobnameOutput);
job.setJarByClass(JobIndexerPartion.class);
fs.delete(new Path(output), true);
fs.delete(smallindex, true);
Configuration conf = job.getConfiguration();
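// Read the index field list out of Solr's schema.xml and ship the whole
// solr/conf directory to the tasks via the distributed cache.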
String fields = JobIndexPublic.readFieldsFromSchemaXml(solrHome + "/solr/conf/schema.xml", fs, conf);
JobIndexPublic.setDistributecache(new Path(solrHome, "solr/conf"), fs, conf);
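// Configure the column delimiter; empty, "default", and "\001" all appear
// to stand for the built-in default separator.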
if (split.equals("\t")) {
// A literal tab is passed as the marker string "tab", presumably because
// it would not survive the round trip through the configuration XML.
conf.set("higo.column.split", "tab");
} else if (!split.isEmpty() && !split.equals("default") && !split.equals("\001")) {
conf.set("higo.column.split", split);
}
conf.set("uniq.check.field", uniqCheckField);
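// Remaining job-wide parameters handed to the tasks: custom field list,
// input base directory, the `usedthedate` flag, and the field list parsed
// from schema.xml.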
conf.set("higo.column.custfields", custFields);
conf.set("higo.input.base", inputBase);
conf.setBoolean("higo.column.userthedate", usedthedate);
//conf.set("mapred.reduce.slowstart.completed.maps", "0.01");
conf.set("higo.index.fields", fields);
job.setPartitionerClass(PairPartion.class);
job.setMapperClass(IndexMapper.class);
job.setMapOutputKeyClass(PairWriteable.class);
job.setMapOutputValueClass(DocumentMap.class);
job.setReducerClass(IndexReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileOutputFormat.setOutputPath(job, smallindex);
job.setNumReduceTasks(shards * parallel);
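// Run stage 1. With an updateStatus callback the job is submitted
// asynchronously and polled every three seconds so the caller can show
// progress; update.dump(job) returning true is treated as failure.
// Without a callback the job simply runs synchronously.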
int result = 0;
if (update != null) {
job.submit();
while (!job.isComplete()) {
update.update(1, job);
Thread.sleep(3000);
}
if (update.dump(job)) {
return -1;
}
} else {
result = job.waitForCompletion(true) ? 0 : -1;
}
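// Stage 2, only if stage 1 succeeded: feed the stage-1 part files through
// the default (identity) mapper and IndexReducerMerge to produce exactly
// `shards` final partitions.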
if (result == 0) {
Job job2 = new Job(new Configuration(jconf));
JobIndexPublic.setJars(job2.getConfiguration());
job2.setJobName("mdrill_stage_2@" + jobnameOutput);
Configuration conf2 = job2.getConfiguration();
JobIndexPublic.setDistributecache(new Path(solrHome, "solr/conf"), fs, conf2);
conf2.set("higo.index.fields", fields);
job2.setJarByClass(JobIndexerPartion.class);
job2.setInputFormatClass(SequenceFileInputFormat.class);
SequenceFileInputFormat.addInputPath(job2, new Path(smallindex, "part-r-*"));
job2.setMapOutputKeyClass(IntWritable.class);
job2.setMapOutputValueClass(Text.class);
job2.setPartitionerClass(IntPartion.class);
job2.setReducerClass(IndexReducerMerge.class);
job2.setOutputKeyClass(IntWritable.class);
job2.setOutputValueClass(Text.class);
job2.setOutputFormatClass(SequenceFileOutputFormat.class);
job2.setNumReduceTasks(shards);
SequenceFileOutputFormat.setOutputPath(job2, new Path(output));
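// Same submit-and-poll pattern as stage 1; update.finish() signals the
// caller that the last stage is done. Note that this path, unlike the
// waitForCompletion branch, never folds job2's final status into `result`.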
if (update != null) {
job2.submit();
while (!job2.isComplete()) {
update.update(2, job2);
Thread.sleep(3000);
}
update.finish();
} else {
result = job2.waitForCompletion(true) ? 0 : -1;
}
}
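// Drop the intermediate stage-1 output whether or not stage 2 ran.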
fs.delete(smallindex, true);