Package org.apache.hadoop.mapreduce

Examples of org.apache.hadoop.mapreduce.InputFormat


        PigInputFormat.mergeSplitSpecificConf(loadFunc, pigSplit, conf);

        // for backward compatibility
        PigInputFormat.sJob = conf;

        InputFormat inputFormat = loadFunc.getInputFormat();

        // "pig.inpLimits" holds the per-input record limits that Pig
        // serialized into the Configuration as a string property.
        List<Long> inpLimitLists =
                (ArrayList<Long>)ObjectSerializer.deserialize(
                        conf.get("pig.inpLimits"));
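The "pig.inpLimits" value read above is an ordinary Java object flattened into a Configuration string. A minimal sketch of that round trip, assuming Pig's ObjectSerializer is on the classpath (the limit values here are made up):

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.pig.impl.util.ObjectSerializer;

public class ConfListRoundTrip {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Serialize an arbitrary Serializable into a conf string property.
        ArrayList<Long> limits = new ArrayList<Long>();
        limits.add(100L);
        limits.add(-1L);
        conf.set("pig.inpLimits", ObjectSerializer.serialize(limits));

        // ...and recover it on the other side, as the snippet above does.
        @SuppressWarnings("unchecked")
        ArrayList<Long> restored = (ArrayList<Long>)
                ObjectSerializer.deserialize(conf.get("pig.inpLimits"));
        System.out.println(restored);
    }
}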


                        inputSpecificJob);
                // The above setLocation call could write to the conf within
                // the inputSpecificJob - use this updated conf

                // get the InputFormat from it and ask for splits
                InputFormat inpFormat = loadFunc.getInputFormat();
                List<InputSplit> oneInputSplits = inpFormat.getSplits(
                        HadoopShims.createJobContext(inputSpecificJob.getConfiguration(),
                                jobcontext.getJobID()));
                List<InputSplit> oneInputPigSplits = getPigSplits(
                        oneInputSplits, i, inpTargets.get(i),
                        HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()),
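The pattern in this snippet — configure a Job for a single input, then ask the InputFormat for its splits — can be reproduced with stock Hadoop classes. A minimal sketch using TextInputFormat on Hadoop 2.x, where Job itself implements JobContext (the input path is illustrative):

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class ListSplits {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path("/user/data/input"));

        // Job implements JobContext, so it can be handed to getSplits()
        // directly instead of going through a shim layer.
        TextInputFormat inputFormat = new TextInputFormat();
        List<InputSplit> splits = inputFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println(split);
        }
    }
}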

                            .instantiateFuncFromSpec(ld.getLFile()
                                    .getFuncSpec());
                            Job job = new Job(conf);
                            loader.setUDFContextSignature(ld.getSignature());
                            loader.setLocation(location, job);
                            InputFormat inf = loader.getInputFormat();
                            List<InputSplit> splits = inf.getSplits(HadoopShims.cloneJobContext(job));
                            List<List<InputSplit>> results =
                                    MapRedUtil.getCombinePigSplits(splits,
                                            HadoopShims.getDefaultBlockSize(fs, path),
                                            conf);
                            numFiles += results.size();
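getCombinePigSplits packs many small splits into larger ones, using the filesystem's default block size as the target size. A small sketch of obtaining that target with plain Hadoop calls (the path is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockSizeProbe {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/user/data/input");
        FileSystem fs = path.getFileSystem(conf);

        // The per-path default block size is the natural upper bound
        // when combining many small splits into one.
        long target = fs.getDefaultBlockSize(path);
        System.out.println("combine target = " + target + " bytes");
    }
}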

   * IndexableLoadFunc interface implementation
   */
  @Override
  public void initialize(Configuration conf) throws IOException {
    try {
      InputFormat inputFormat = this.getInputFormat();
      TaskAttemptID id = TaskAttemptID.forName(conf.get("mapred.task.id"));
     
      if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        conf.set("mapreduce.job.credentials.binary",
            System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
      }
      List<FileSplit> fileSplits = inputFormat.getSplits(HadoopShims.createJobContext(conf, null));
      this.readers = new IndexedStorageRecordReader[fileSplits.size()];
     
      int idx = 0;
      Iterator<FileSplit> it = fileSplits.iterator();
      while (it.hasNext()) {
        FileSplit fileSplit = it.next();
        TaskAttemptContext context = HadoopShims.createTaskAttemptContext(conf, id);
        IndexedStorageRecordReader r = (IndexedStorageRecordReader) inputFormat.createRecordReader(fileSplit, context);
        r.initialize(fileSplit, context);
        this.readers[idx] = r;
        idx++;
      }

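Outside of Pig, the same split-to-reader loop needs only core mapreduce classes: list the splits, then create and initialize one RecordReader per split. A minimal sketch with TextInputFormat, assuming Hadoop 2.x where TaskAttemptContextImpl is the concrete context class (the file name is illustrative):

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class ReadAllSplits {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        FileInputFormat.addInputPath(job, new Path("input.txt"));

        TextInputFormat inputFormat = new TextInputFormat();
        List<InputSplit> splits = inputFormat.getSplits(job);

        for (InputSplit split : splits) {
            // One reader per split, initialized with a fresh context,
            // mirroring the loop in the snippet above.
            TaskAttemptContext ctx =
                    new TaskAttemptContextImpl(conf, new TaskAttemptID());
            RecordReader<LongWritable, Text> reader =
                    inputFormat.createRecordReader(split, ctx);
            reader.initialize(split, ctx);
            while (reader.nextKeyValue()) {
                System.out.println(reader.getCurrentValue());
            }
            reader.close();
        }
    }
}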

    @SuppressWarnings("rawtypes")
    @Override
    public InputFormat getInputFormat() throws IOException {
        AvroStorageLog.funcCall("getInputFormat");
        InputFormat result = null;
        if(inputAvroSchema != null) {
            result = new PigAvroInputFormat(
            inputAvroSchema, ignoreBadFiles, schemaToMergedSchemaMap, useMultipleSchemas);
        } else {
            result = new TextInputFormat();
        }
        return result;
    }
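Here the LoadFunc picks the InputFormat at runtime. The job-level counterpart of that decision is to register the class on the Job and instantiate it reflectively, which is broadly how the framework resolves it for a task. A minimal sketch, assuming Hadoop 2.x:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class ResolveInputFormat {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setInputFormatClass(TextInputFormat.class);

        // getInputFormatClass() reads the class name back out of the conf;
        // ReflectionUtils instantiates and configures it.
        InputFormat<?, ?> inputFormat =
                ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
        System.out.println(inputFormat.getClass().getName());
    }
}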

  @SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
  public static <K,V> void writePartitionFile(Job job, Sampler<K,V> sampler)
      throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf =
        ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = sampler.getSample(inf, job);
    LOG.info("Using " + samples.length + " samples");
    RawComparator<K> comparator =
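This writePartitionFile is the sampling step behind total-order sorting: it draws samples through the job's InputFormat and writes the cut points consumed by TotalOrderPartitioner. A hedged usage sketch (the sampling parameters and partition file path are illustrative, and a real job would also have its input path and InputFormat configured before sampling):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class SamplePartitions {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "total-order-sort");
        job.setNumReduceTasks(4);
        TotalOrderPartitioner.setPartitionFile(
                job.getConfiguration(), new Path("_partitions"));

        // Sample ~10% of records, at most 10000 of them,
        // drawn from at most 10 splits.
        InputSampler.Sampler<Text, Text> sampler =
                new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);
        InputSampler.writePartitionFile(job, sampler);
    }
}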

    conf1.set(DUMMY_KEY, "STATE1");
    TaskAttemptContext context1 = new TaskAttemptContextImpl(conf1, taskId);

    // This will create a CombineFileRecordReader that itself contains a
    // DummyRecordReader.
    InputFormat inputFormat = new ChildRRInputFormat();

    Path [] files = { new Path("file1") };
    long [] lengths = { 1 };

    CombineFileSplit split = new CombineFileSplit(files, lengths);

    RecordReader rr = inputFormat.createRecordReader(split, context1);
    assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

    // Verify that the initial configuration is the one being used.
    // Right after construction the dummy key should have value "STATE1"
    assertEquals("Invalid initial dummy key value", "STATE1",
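The ChildRRInputFormat under test follows the usual CombineFileInputFormat recipe: createRecordReader returns a CombineFileRecordReader parameterized with a per-file reader class, which must expose a (CombineFileSplit, TaskAttemptContext, Integer) constructor. A hedged sketch of that recipe; PerFileReader is a hypothetical stub:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class MyCombineInputFormat
        extends CombineFileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException {
        // CombineFileRecordReader instantiates one PerFileReader per file
        // in the combined split and chains them together.
        return new CombineFileRecordReader<LongWritable, Text>(
                (CombineFileSplit) split, context, PerFileReader.class);
    }

    // The per-file reader must expose exactly this three-argument
    // constructor; the Integer is the index of the file within the
    // combined split. A real implementation would read that file.
    public static class PerFileReader
            extends RecordReader<LongWritable, Text> {
        public PerFileReader(CombineFileSplit split,
                TaskAttemptContext context, Integer index) { }
        @Override public void initialize(InputSplit s, TaskAttemptContext c) { }
        @Override public boolean nextKeyValue() { return false; }
        @Override public LongWritable getCurrentKey() { return null; }
        @Override public Text getCurrentValue() { return null; }
        @Override public float getProgress() { return 0f; }
        @Override public void close() { }
    }
}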

    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);

    // This will create a CombineFileRecordReader that itself contains a
    // DummyRecordReader.
    InputFormat inputFormat = new ChildRRInputFormat();

    Path [] files = { new Path("file1"), new Path("file2") };
    long [] lengths = { 1, 1 };

    CombineFileSplit split = new CombineFileSplit(files, lengths);
    RecordReader rr = inputFormat.createRecordReader(split, context);
    assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

    // first initialize() call comes from MapTask. We'll do it here.
    rr.initialize(split, context);
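A CombineFileSplit built this way can be inspected file by file, which helps when checking what the chained readers will be handed. A small sketch (file names mirror the test above):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class InspectCombineSplit {
    public static void main(String[] args) {
        Path[] files = { new Path("file1"), new Path("file2") };
        long[] lengths = { 1, 1 };
        CombineFileSplit split = new CombineFileSplit(files, lengths);

        // getLength() is the total; each file keeps its own length.
        System.out.println("total = " + split.getLength());
        for (int i = 0; i < split.getNumPaths(); i++) {
            System.out.println(split.getPath(i) + " -> " + split.getLength(i));
        }
    }
}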

    conf1.set(DUMMY_KEY, "STATE1");
    // Older Hadoop API: TaskAttemptContext is constructed directly here;
    // newer releases use TaskAttemptContextImpl instead (see above).
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);

    // This will create a CombineFileRecordReader that itself contains a
    // DummyRecordReader.
    InputFormat inputFormat = new ChildRRInputFormat();

    Path [] files = { new Path("file1") };
    long [] lengths = { 1 };

    CombineFileSplit split = new CombineFileSplit(files, lengths);

    RecordReader rr = inputFormat.createRecordReader(split, context1);
    assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

    // Verify that the initial configuration is the one being used.
    // Right after construction the dummy key should have value "STATE1"
    assertEquals("Invalid initial dummy key value", "STATE1",

    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContext(conf, taskId);

    // This will create a CombineFileRecordReader that itself contains a
    // DummyRecordReader.
    InputFormat inputFormat = new ChildRRInputFormat();

    Path [] files = { new Path("file1"), new Path("file2") };
    long [] lengths = { 1, 1 };

    CombineFileSplit split = new CombineFileSplit(files, lengths);
    RecordReader rr = inputFormat.createRecordReader(split, context);
    assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

    // first initialize() call comes from MapTask. We'll do it here.
    rr.initialize(split, context);
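The last two snippets repeat the two before them with one difference: on 0.20-era Hadoop, TaskAttemptContext was a concrete class that could be constructed directly, while later releases turned it into an interface implemented by TaskAttemptContextImpl. A small compatibility sketch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class ContextCompat {
    public static TaskAttemptContext newContext(Configuration conf,
            TaskAttemptID taskId) {
        // Newer Hadoop (0.21+ / 2.x): TaskAttemptContext is an interface
        // and TaskAttemptContextImpl is the concrete implementation.
        return new TaskAttemptContextImpl(conf, taskId);
        // Older Hadoop (0.20): the class itself was instantiable:
        //   return new TaskAttemptContext(conf, taskId);
    }
}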


