Package org.apache.hadoop.mapred

Examples of org.apache.hadoop.mapred.InputFormat
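
All of the snippets below target the old "mapred" API, in which the entire InputFormat contract is two methods: getSplits carves the job's input into chunks, and getRecordReader turns one chunk into a stream of key/value records. For reference, a minimal sketch of the interface as it appears in Hadoop:

    import java.io.IOException;

    public interface InputFormat<K, V> {
      // Logically split the job's input; numSplits is only a hint.
      InputSplit[] getSplits(JobConf job, int numSplits) throws IOException;

      // Create a reader over the records of one split.
      RecordReader<K, V> getRecordReader(InputSplit split, JobConf job,
          Reporter reporter) throws IOException;
    }

The first excerpt, from Hive, obtains a reader from the underlying format and wraps it in a HiveRecordReader: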


    }

    pushProjectionsAndFilters(cloneJobConf, inputFormatClass, hsplit.getPath()
      .toString(), hsplit.getPath().toUri().getPath(), nonNative);

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass,
        cloneJobConf);
    RecordReader innerReader = inputFormat.getRecordReader(inputSplit,
        cloneJobConf, reporter);

    HiveRecordReader<K, V> rr = new HiveRecordReader<K, V>(innerReader);
    rr.initIOContext(hsplit, job, inputFormatClass, innerReader);
    return rr;
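
HiveRecordReader is a delegating reader: it forwards every call to the reader created by the underlying InputFormat and layers Hive's IOContext bookkeeping on top. A minimal sketch of that delegation pattern, using a hypothetical class name (the real HiveRecordReader does considerably more):

    import java.io.IOException;
    import org.apache.hadoop.mapred.RecordReader;

    // Hypothetical pass-through wrapper; shows the shape only.
    public class DelegatingRecordReader<K, V> implements RecordReader<K, V> {
      private final RecordReader<K, V> inner;

      public DelegatingRecordReader(RecordReader<K, V> inner) {
        this.inner = inner;
      }

      public boolean next(K key, V value) throws IOException {
        // A real wrapper such as HiveRecordReader would update its
        // IOContext here before delegating.
        return inner.next(key, value);
      }

      public K createKey() { return inner.createKey(); }
      public V createValue() { return inner.createValue(); }
      public long getPos() throws IOException { return inner.getPos(); }
      public float getProgress() throws IOException { return inner.getProgress(); }
      public void close() throws IOException { inner.close(); }
    }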


    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // create a new InputFormat instance if this is the first time we have
      // seen this class
      Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);

      // Make filter pushdown information available to getSplits.
      ArrayList<String> aliases =
        mrwork.getPathToAliases().get(dir.toUri().toString());
      if ((aliases != null) && (aliases.size() == 1)) {
        Operator op = mrwork.getAliasToWork().get(aliases.get(0));
        if ((op != null) && (op instanceof TableScanOperator)) {
          TableScanOperator tableScan = (TableScanOperator) op;
          pushFilters(newjob, tableScan);
        }
      }

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
      for (InputSplit is : iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
      }
    }
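
Outside Hive, the same pair of calls (setInputPaths, then getSplits) looks like this with a stock format. A minimal driver sketch, assuming an illustrative input path:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.InputSplit;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.TextInputFormat;

    public class SplitDemo {
      public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SplitDemo.class);
        // Illustrative path; substitute a real input file.
        FileInputFormat.setInputPaths(job, new Path("/tmp/demo.txt"));

        TextInputFormat format = new TextInputFormat();
        format.configure(job); // TextInputFormat is JobConfigurable

        // numSplits is only a hint; the format may return more or fewer.
        InputSplit[] splits = format.getSplits(job, 4);
        for (InputSplit split : splits) {
          System.out.println(split + " length=" + split.getLength());
        }
      }
    }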

    // for each dir, get the InputFormat, and do validateInput.
    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // create a new InputFormat instance if this is the first time we have
      // seen this class
      InputFormat inputFormat = getInputFormatFromCache(part
          .getInputFileFormatClass(), job);

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      ShimLoader.getHadoopShims().inputFormatValidateInput(inputFormat, newjob);
    }
  }
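
The detour through ShimLoader exists because validateInput was dropped from the InputFormat interface in later Hadoop releases; routing the call through a version-specific shim lets the same Hive code run against Hadoop versions with and without the method.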

      }

      // Use HiveInputFormat if any of the paths is not splittable
      Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present
      // in this Hadoop tree, we rely on a configuration variable instead
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed, once
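
For context: MAPREDUCE-1597 changed how CombineFileInputFormat deals with non-splittable files. Hive cannot detect at runtime whether the running Hadoop carries that fix, so the getHadoopSupportsSplittable() flag, fed from configuration, decides whether the workaround that follows is still needed.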

      } finally {
        writer.close();
      }

      // try splitting the file in a variety of sizes
      InputFormat format = new SequenceFileInputFormat();
      RecInt key = new RecInt();
      RecBuffer value = new RecBuffer();
      for (int i = 0; i < 3; i++) {
        int numSplits =
          random.nextInt(MAX_LENGTH/(SequenceFile.SYNC_INTERVAL/20))+1;
        //LOG.info("splitting: requesting = " + numSplits);
        FileSplit[] splits = format.getSplits(fs, job, numSplits);
        //LOG.info("splitting: got =        " + splits.length);

        // check each split
        BitSet bits = new BitSet(length);
        for (int j = 0; j < splits.length; j++) {
          RecordReader reader =
            format.getRecordReader(fs, splits[j], job, reporter);
          try {
            int count = 0;
            while (reader.next(key, value)) {
              // if (bits.get(key.get())) {
              // LOG.info("splits["+j+"]="+splits[j]+" : " + key.get());
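
The loop above is cut off at the duplicate check. The idiom it implements marks each key in a BitSet and fails if a key repeats across splits, then confirms that every key was seen exactly once. A sketch restating the check with the snippet's names, assuming JUnit asserts are in scope and that key.get() yields each record's index in [0, length):

    // Verification sketch; reuses format, fs, job, reporter, key, value.
    BitSet bits = new BitSet(length);
    for (int j = 0; j < splits.length; j++) {
      RecordReader reader = format.getRecordReader(fs, splits[j], job, reporter);
      try {
        while (reader.next(key, value)) {
          assertFalse("duplicate record " + key.get(), bits.get(key.get()));
          bits.set(key.get());
        }
      } finally {
        reader.close();
      }
    }
    // No record may be missing either.
    assertEquals(length, bits.cardinality());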


    }

    pushProjectionsAndFilters(job, inputFormatClass, hsplit.getPath()
      .toString(), hsplit.getPath().toUri().getPath(), nonNative);

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    RecordReader innerReader = null;
    try {
      innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
    } catch (Exception e) {
      innerReader = HiveIOExceptionHandlerUtil
          .handleRecordReaderCreationException(e, job);
    }
    HiveRecordReader<K, V> rr = new HiveRecordReader<K, V>(innerReader, job);
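
This later variant differs from the first getRecordReader excerpt in one respect: reader creation is routed through HiveIOExceptionHandlerUtil, which consults the exception handlers configured for the job; a handler may recover and return a substitute reader, otherwise the failure is rethrown as an IOException.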

    }

    pushProjectionsAndFilters(job, inputFormatClass, hsplit.getPath()
        .toString(), hsplit.getPath().toUri().getPath());

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

    BucketizedHiveRecordReader<K, V> rr = new BucketizedHiveRecordReader<K, V>(
        inputFormat, hsplit, job, reporter);
    rr.initIOContext(hsplit, job, inputFormatClass);
    return rr;
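
Unlike HiveRecordReader, the bucketized reader is handed the InputFormat itself rather than a ready-made inner reader: a BucketizedHiveInputSplit bundles several underlying splits, so the reader must create a fresh inner reader each time it advances to the next one.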

    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // create a new InputFormat instance if this is the first time we have
      // seen this class
      Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      newjob.setInputFormat(inputFormat.getClass());

      FileStatus[] listStatus = listStatus(newjob, dir);

      for (FileStatus status : listStatus) {
        LOG.info("block size: " + status.getBlockSize());
        LOG.info("file length: " + status.getLen());
        FileInputFormat.setInputPaths(newjob, status.getPath());
        InputSplit[] iss = inputFormat.getSplits(newjob, 0);
        if (iss != null && iss.length > 0) {
          numOrigSplits += iss.length;
          result.add(new BucketizedHiveInputSplit(iss, inputFormatClass
              .getName()));
        }
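
Contrast this with the plain HiveInputFormat.getSplits excerpt earlier: here getSplits runs once per file (with a split-count hint of 0), and all the splits of a single file are folded into one BucketizedHiveInputSplit, so that an entire bucket can be consumed by a single task.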

    // from Utilities.getInputSummary()
    private long getFileLength(JobConf conf, Path path, Class<? extends InputFormat> clazz)
        throws IOException {
      ContentSummary summary;
      if (ContentSummaryInputFormat.class.isAssignableFrom(clazz)) {
        InputFormat input = HiveInputFormat.getInputFormatFromCache(clazz, conf);
        summary = ((ContentSummaryInputFormat)input).getContentSummary(path, conf);
      } else {
        FileSystem fs = path.getFileSystem(conf);
        try {
          summary = fs.getContentSummary(path);
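
The excerpt stops inside the fallback branch. A sketch of how such a helper can be rounded out, under the assumption (not shown above) that a path which has disappeared contributes zero length:

    // Completion sketch; the FileNotFoundException policy is an assumption.
    private long getFileLength(JobConf conf, Path path,
        Class<? extends InputFormat> clazz) throws IOException {
      ContentSummary summary;
      if (ContentSummaryInputFormat.class.isAssignableFrom(clazz)) {
        // Formats that know their own size report it directly.
        InputFormat input = HiveInputFormat.getInputFormatFromCache(clazz, conf);
        summary = ((ContentSummaryInputFormat) input).getContentSummary(path, conf);
      } else {
        FileSystem fs = path.getFileSystem(conf);
        try {
          summary = fs.getContentSummary(path);
        } catch (FileNotFoundException e) {
          return 0; // assumption: a vanished path counts as empty input
        }
      }
      return summary.getLength();
    }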
