Package org.apache.hadoop.mapred

Examples of org.apache.hadoop.mapred.InputFormat
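The excerpts on this page all drive the same three-step contract of the old-API org.apache.hadoop.mapred.InputFormat: getSplits() partitions the input, getRecordReader() opens a reader per split, and next() iterates key/value pairs until it returns false. As orientation, here is a minimal self-contained sketch of that contract. TextInputFormat and the /tmp/input path are illustrative assumptions, not taken from any of the excerpts below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class InputFormatDemo {
  public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf();
    FileInputFormat.setInputPaths(conf, new Path("/tmp/input"));   // assumed input path

    TextInputFormat format = new TextInputFormat();
    format.configure(conf);                       // TextInputFormat is JobConfigurable

    InputSplit[] splits = format.getSplits(conf, 1);   // 1 is only a hint
    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, conf, Reporter.NULL);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      while (reader.next(key, value)) {           // key = byte offset, value = line
        System.out.println(key + "\t" + value);
      }
      reader.close();
    }
  }
}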


    // Excerpt: tail of a getRecordReader(...) implementation. It pushes the
    // column projections and filters for the split's path, fetches a cached
    // InputFormat instance, delegates reader creation to it, and wraps the
    // result in a HiveRecordReader. Failures during reader creation are routed
    // through HiveIOExceptionHandlerUtil, which may supply a fallback reader.
    pushProjectionsAndFilters(job, inputFormatClass, hsplit.getPath()
        .toString(), hsplit.getPath().toUri().getPath(), nonNative);

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    RecordReader innerReader = null;
    try {
      innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
    } catch (Exception e) {
      innerReader = HiveIOExceptionHandlerUtil
          .handleRecordReaderCreationException(e, job);
    }
    HiveRecordReader<K, V> rr = new HiveRecordReader<K, V>(innerReader, job);
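Several of the excerpts call a getInputFormatFromCache(...) helper. As a rough illustration of the idea behind such a helper, a per-class cache of configured InputFormat instances, here is a hedged sketch; the class, field, and method names below are assumptions for illustration, not Hive's actual implementation.

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

public final class InputFormatCache {
  // One configured instance per InputFormat class, reused across calls.
  private final Map<Class<?>, InputFormat<?, ?>> cache =
      new HashMap<Class<?>, InputFormat<?, ?>>();

  public InputFormat<?, ?> get(Class<? extends InputFormat<?, ?>> clazz, JobConf job) {
    InputFormat<?, ?> format = cache.get(clazz);
    if (format == null) {
      // ReflectionUtils configures the new instance (setConf/configure)
      // when the class supports it.
      format = ReflectionUtils.newInstance(clazz, job);
      cache.put(clazz, format);
    }
    return format;
  }
}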


    // Excerpt: tail of a getRecordReader(...) implementation for bucketized
    // input. The cached InputFormat for the split's partition is handed to a
    // BucketizedHiveRecordReader, which reads through the grouped splits of
    // the bucket, and the reader's IO context is initialized before returning.
    pushProjectionsAndFilters(job, inputFormatClass, hsplit.getPath()
        .toString(), hsplit.getPath().toUri().getPath());

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

    BucketizedHiveRecordReader<K, V> rr = new BucketizedHiveRecordReader<K, V>(
        inputFormat, hsplit, job, reporter);
    rr.initIOContext(hsplit, job, inputFormatClass);
    return rr;

    // Excerpt: split generation for bucketized input. Every file under each
    // partition directory becomes one BucketizedHiveInputSplit that carries
    // all of that file's underlying splits.
    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // instantiate the partition's InputFormat only the first time the class
      // is seen; later lookups reuse the cached instance
      Class inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      newjob.setInputFormat(inputFormat.getClass());

      FileStatus[] listStatus = listStatus(newjob, dir);

      for (FileStatus status : listStatus) {
        LOG.info("block size: " + status.getBlockSize());
        LOG.info("file length: " + status.getLen());
        FileInputFormat.setInputPaths(newjob, status.getPath());
        InputSplit[] iss = inputFormat.getSplits(newjob, 0);
        if (iss != null && iss.length > 0) {
          numOrigSplits += iss.length;
          result.add(new BucketizedHiveInputSplit(iss, inputFormatClass
              .getName()));
        }

    // Excerpt from an ORC ACID read test: a row is deleted, the updater is
    // closed, and the data is read back through OrcInputFormat. With 1- and
    // 2-byte split-size hints the directory yields 5 splits; the first four
    // hold two rows each and the fifth holds none.
    ru.delete(100, 0, 8);
    ru.close(false);

    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(5, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;

    // read the first four splits here; the empty fifth split is checked below
    for(int i=0; i < 4; ++i) {
      System.out.println("starting split " + i);
      rr = inf.getRecordReader(splits[i], job, Reporter.NULL);
      NullWritable key = rr.createKey();
      OrcStruct value = rr.createValue();

      // there should be exactly two rows per split
      for(int j=0; j < 2; ++j) {
        System.out.println("i = " + i + ", j = " + j);
        assertEquals(true, rr.next(key, value));
        System.out.println("record = " + value);
        assertEquals(i + "." + j, value.getFieldValue(2).toString());
      }
      assertEquals(false, rr.next(key, value));
    }
    rr = inf.getRecordReader(splits[4], job, Reporter.NULL);
    assertEquals(false, rr.next(rr.createKey(), rr.createValue()));
  }
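The test above drains each split by hand with createKey()/createValue()/next(). The same read loop can be factored into a small generic helper, sketched below; the class and method names are illustrative, not part of the test.

import java.io.IOException;

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class SplitUtils {
  // Read every record from every split and return the total row count.
  public static <K, V> long countRows(InputFormat<K, V> format, JobConf job,
      int numSplitsHint) throws IOException {
    long rows = 0;
    for (InputSplit split : format.getSplits(job, numSplitsHint)) {
      RecordReader<K, V> reader = format.getRecordReader(split, job, Reporter.NULL);
      try {
        K key = reader.createKey();
        V value = reader.createValue();
        while (reader.next(key, value)) {
          rows++;
        }
      } finally {
        reader.close();
      }
    }
    return rows;
  }
}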


    // Excerpt from an ORC test: rows are inserted through a RecordUpdater in
    // transaction 2, the updater is closed, and the data is read back. With
    // bucket_count set to 1 the directory yields a single split despite the
    // tiny split-size hints.
    for(int i=0; i < values.length; ++i) {
      ru.insert(2, new MyRow(values[i]));
    }
    ru.close(false);

    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    job.set("bucket_count", "1");
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
    rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    values = new String[]{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"};
    OrcStruct row = rr.createValue();
    for(int i = 0; i < values.length; ++i) {
      System.out.println("Checking " + i);
      assertEquals(true, rr.next(NullWritable.get(), row));

    // Excerpt from an ORC ACID test around delta flushing: rows "6"-"8" go
    // into an open delta and are read back before any flush; after further
    // flushes and inserts the splits are regenerated and the bucket's
    // *_flush_length side file is checked for existence and length.
    ru = of.getRecordUpdater(root, options);
    values = new String[]{"6", "7", "8"};
    for(int i=0; i < values.length; ++i) {
      ru.insert(1, new MyRow(values[i]));
    }
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", root.toString());
    job.set("bucket_count", "2");

    // read the keys before the delta is flushed
    InputSplit[] splits = inf.getSplits(job, 1);
    assertEquals(2, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
        inf.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    System.out.println("Looking at split " + splits[0]);
    for(int i=1; i < 6; ++i) {
      System.out.println("Checking row " + i);
      assertEquals(true, rr.next(key, value));
      assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
    }
    assertEquals(false, rr.next(key, value));

    ru.flush();
    ru.flush();
    values = new String[]{"9", "10"};
    for(int i=0; i < values.length; ++i) {
      ru.insert(3, new MyRow(values[i]));
    }
    ru.flush();

    splits = inf.getSplits(job, 1);
    assertEquals(2, splits.length);
    rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    Path sideFile = new Path(root +
        "/delta_0000010_0000019/bucket_00001_flush_length");
    assertEquals(true, fs.exists(sideFile));
    assertEquals(24, fs.getFileStatus(sideFile).getLen());
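The final assertion above checks that the bucket's *_flush_length side file is exactly 24 bytes, which is consistent with three 8-byte length entries. Assuming that simple layout (an assumption made here for illustration, not something the excerpt states), the entries could be read back like this:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SideFileDump {
  // Print the 8-byte entries of a *_flush_length side file, assuming it is a
  // plain sequence of Java longs (an illustrative assumption).
  public static void dump(FileSystem fs, Path sideFile) throws IOException {
    long len = fs.getFileStatus(sideFile).getLen();
    FSDataInputStream in = fs.open(sideFile);
    try {
      for (long pos = 0; pos + 8 <= len; pos += 8) {
        System.out.println("flushed length: " + in.readLong());
      }
    } finally {
      in.close();
    }
  }
}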

      // Excerpt: per-directory split generation. The excerpt begins
      // mid-statement; the PartitionDesc lookup is completed below
      // (HiveFileFormatUtils is assumed as the receiver).
      PartitionDesc part = HiveFileFormatUtils
          .getPartitionDescFromPathRecursively(pathToPartitionInfo, dir,
              IOPrepareCache.get().allocatePartitionDescMap(), true);
      // instantiate the partition's InputFormat only the first time the class
      // is seen; later lookups reuse the cached instance
      Class inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
      for (InputSplit is : iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
      }
    }
    return result.toArray(new HiveInputSplit[result.size()]);

    // Excerpt: reader construction for one partition of a combined split. The
    // InputFormat class is resolved by name, a FileSplit covering this
    // partition's slice of the combined split is carved out, and the
    // underlying reader and IO context are initialized.
    try {
      inputFormatClass = Class.forName(inputFormatClassName);
    } catch (ClassNotFoundException e) {
      throw new IOException("CombineHiveRecordReader: class not found "
          + inputFormatClassName);
    }
    InputFormat inputFormat = HiveInputFormat.getInputFormatFromCache(
        inputFormatClass, job);

    // create a split for the given partition
    FileSplit fsplit = new FileSplit(hsplit.getPaths()[partition],
        hsplit.getStartOffsets()[partition], hsplit.getLengths()[partition],
        hsplit.getLocations());

    this.recordReader = inputFormat.getRecordReader(fsplit, job, reporter);
    this.initIOContext(fsplit, job, inputFormatClass, this.recordReader);
  }

          // Excerpt: input-size estimation for one input path. The partition's
          // InputFormat is looked up from the cache; if it can report its own
          // ContentSummary, that is used, otherwise the code falls back to the
          // path's FileSystem (the fallback branch is cut off below).
          JobConf jobConf = new JobConf(ctx.getConf());
          PartitionDesc partDesc = work.getPathToPartitionInfo().get(
              p.toString());
          Class<? extends InputFormat> inputFormatCls = partDesc
              .getInputFileFormatClass();
          InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
              inputFormatCls, jobConf);
          if (inputFormatObj instanceof ContentSummaryInputFormat) {
            cs = ((ContentSummaryInputFormat) inputFormatObj).getContentSummary(p, jobConf);
          } else {
            FileSystem fs = p.getFileSystem(ctx.getConf());
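The excerpt is cut off just after obtaining a FileSystem for the case where the InputFormat is not a ContentSummaryInputFormat. A plain fallback for that branch, sketched here under the assumption that the FileSystem's own summary is acceptable, would be:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SummaryFallback {
  // When the InputFormat cannot report its own content summary, fall back to
  // the FileSystem's length/file/directory counts for the path.
  public static ContentSummary summarize(Path p, Configuration conf) throws IOException {
    FileSystem fs = p.getFileSystem(conf);
    return fs.getContentSummary(p);
  }
}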
