Package org.apache.hadoop.mapred

Examples of org.apache.hadoop.mapred.InputFormat
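The excerpts below, gathered from Apache Hive and from Hadoop's own tests, all exercise the old-style org.apache.hadoop.mapred.InputFormat contract: getSplits() carves the job input into InputSplits, and getRecordReader() turns one split into a stream of key/value records. Each excerpt is a fragment and cuts off where the original listing did. For orientation, here is a minimal, hypothetical sketch of that contract being driven directly; the input path is a made-up example:

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class InputFormatDemo {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(InputFormatDemo.class);
    FileInputFormat.setInputPaths(job, new Path("/tmp/demo-input")); // example path

    TextInputFormat format = new TextInputFormat();
    format.configure(job); // TextInputFormat is JobConfigurable

    // The numSplits argument is only a hint; formats may return more or fewer.
    InputSplit[] splits = format.getSplits(job, 4);

    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      try {
        while (reader.next(key, value)) {
          // process one record; for TextInputFormat the key is the byte offset
        }
      } finally {
        reader.close();
      }
    }
  }
}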


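From what appears to be Hive's CombineHiveRecordReader: the partition's InputFormat class is resolved by name, fetched from a per-class cache, and asked for a record reader over one chunk of a combined split.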
    try {
      inputFormatClass = Class.forName(inputFormatClassName);
    } catch (ClassNotFoundException e) {
      throw new IOException("CombineHiveRecordReader: class not found "
          + inputFormatClassName);
    }
    InputFormat inputFormat = CombineHiveInputFormat.getInputFormatFromCache(inputFormatClass, job);
   
    // create a split for the given partition
    FileSplit fsplit = new FileSplit(hsplit.getPaths()[partition],
                                     hsplit.getStartOffsets()[partition],
                                     hsplit.getLengths()[partition],
                                     hsplit.getLocations());
   
    this.recordReader = inputFormat.getRecordReader(fsplit, job, reporter);
  }
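Several of these excerpts go through getInputFormatFromCache(), which hands back one shared InputFormat instance per class rather than reflectively constructing a new one for every split. A minimal sketch of such a cache, assuming Hadoop's ReflectionUtils for instantiation (a hypothetical stand-in, not the actual Hive code):

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

public class InputFormatCache {
  // One instance per InputFormat class, shared across calls.
  private static final Map<Class<?>, InputFormat> cache =
      new HashMap<Class<?>, InputFormat>();

  public static synchronized InputFormat getInputFormatFromCache(
      Class<? extends InputFormat> cls, JobConf job) {
    InputFormat format = cache.get(cls);
    if (format == null) {
      // newInstance() also calls configure(job) when the class is JobConfigurable.
      format = ReflectionUtils.newInstance(cls, job);
      cache.put(cls, format);
    }
    return format;
  }
}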


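From what looks like HiveInputFormat.getRecordReader(): the JobConf is cloned first, so the column-projection settings apply only to this reader.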
    // Clone the JobConf so the columns needed for reading can be set on the copy
    JobConf cloneJobConf = new JobConf(job);
    initColumnsNeeded(cloneJobConf, inputFormatClass, hsplit.getPath().toString(),
                      hsplit.getPath().toUri().getPath());

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, cloneJobConf);
    return new HiveRecordReader(inputFormat.getRecordReader(inputSplit,
        cloneJobConf, reporter));
  }

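Split computation over partitioned input: each directory's partition descriptor names the InputFormat to use, and every resulting split is wrapped in a HiveInputSplit that records the format's class name.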
    // for each dir, get the InputFormat, and do getSplits.
    for(Path dir: dirs) {
      partitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // create a new InputFormat instance if this is the first time this class is seen
      Class inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      InputSplit[] iss = inputFormat.getSplits(newjob, numSplits/dirs.length);
      for(InputSplit is: iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
      }
    }
    return result.toArray(new HiveInputSplit[result.size()]);
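Two details worth noting: the per-directory hint is numSplits/dirs.length, so each directory receives an equal share of the requested splits, and storing inputFormatClass.getName() in the HiveInputSplit is what later lets getRecordReader() resolve the matching format for that split, as in the first excerpt above.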

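The same per-directory pattern applied to input validation; the actual validateInput() call goes through the shim layer to absorb Hadoop version differences.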
    // for each dir, get the InputFormat, and do validateInput.
    for (Path dir: dirs) {
      partitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // create a new InputFormat instance if this is the first time this class is seen
      InputFormat inputFormat = getInputFormatFromCache(part.getInputFileFormatClass(), job);

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      ShimLoader.getHadoopShims().inputFormatValidateInput(inputFormat, newjob);
    }
  }

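A fragment of split generation, apparently from CombineHiveInputFormat: it falls back to plain HiveInputFormat when a path is not splittable, gated on a configuration flag because the presence of MAPREDUCE-1597 cannot be detected at runtime.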
      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present
      // in the tree or not, we rely on a configuration variable instead.
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed, once

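Input-size estimation: when the partition's InputFormat implements ContentSummaryInputFormat, the content summary is obtained from the format itself rather than from the filesystem.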
              try {
                ContentSummary resultCs;

                Class<? extends InputFormat> inputFormatCls = partDesc
                    .getInputFileFormatClass();
                InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
                    inputFormatCls, myJobConf);
                if (inputFormatObj instanceof ContentSummaryInputFormat) {
                  resultCs = ((ContentSummaryInputFormat) inputFormatObj).getContentSummary(p,
                      myJobConf);
                } else {

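From a Hadoop test of SequenceFileInputFormat: the freshly written file is split at several randomly chosen granularities, and a BitSet records each key so the assertions can verify that every record lands in exactly one split.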
      } finally {
        writer.close();
      }

      // try splitting the file in a variety of sizes
      InputFormat format = new SequenceFileInputFormat();
      RecInt key = new RecInt();
      RecBuffer value = new RecBuffer();
      for (int i = 0; i < 3; i++) {
        int numSplits =
          random.nextInt(MAX_LENGTH/(SequenceFile.SYNC_INTERVAL/20))+1;
        InputSplit[] splits = format.getSplits(job, numSplits);

        // check each split
        BitSet bits = new BitSet(length);
        for (int j = 0; j < splits.length; j++) {
          RecordReader reader =
            format.getRecordReader(splits[j], job, reporter);
          try {
            int count = 0;
            while (reader.next(key, value)) {
              assertFalse("Key in multiple partitions.", bits.get(key.getData()));
              bits.set(key.getData());

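An older variant of HiveInputFormat.getRecordReader(): the split's path, both with and without the URI scheme, is matched against the path-to-alias map, and the needed column IDs of every matching TableScanOperator are pushed into the JobConf before the underlying reader is created.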
      inputFormatClass = job.getClassByName(inputFormatClassName);
    } catch (Exception e) {
      throw new IOException("cannot find class " + inputFormatClassName);
    }

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

   
    if (this.mrwork == null)
      init(job);
    JobConf jobConf = new JobConf(job);
    ArrayList<String> aliases = new ArrayList<String>();
    Iterator<Entry<String, ArrayList<String>>> iterator = this.mrwork
        .getPathToAliases().entrySet().iterator();
    String splitPath = hsplit.getPath().toString();
    String splitPathWithNoSchema = hsplit.getPath().toUri().getPath();
    while (iterator.hasNext()) {
      Entry<String, ArrayList<String>> entry = iterator.next();
      String key = entry.getKey();
      if (splitPath.startsWith(key) || splitPathWithNoSchema.startsWith(key)) {
        ArrayList<String> list = entry.getValue();
        for (String val : list)
          aliases.add(val);
      }
    }
    for (String alias : aliases) {
      Operator<? extends Serializable> op = this.mrwork.getAliasToWork().get(
          alias);
      if (op instanceof TableScanOperator) {
        TableScanOperator tableScan = (TableScanOperator) op;
        ArrayList<Integer> list = tableScan.getNeededColumnIDs();
        if (list != null)
          HiveFileFormatUtils.setReadColumnIDs(jobConf, list);
        else
          HiveFileFormatUtils.setFullyReadColumns(jobConf);
      }
    }
    return new HiveRecordReader(inputFormat.getRecordReader(inputSplit,
        jobConf, reporter));
  }

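Finally, the non-partitioned variant of split computation: the InputFormat comes from the table descriptor rather than from a partition descriptor, but the wrapping is the same.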
    // for each dir, get the InputFormat, and do getSplits.
    for(Path dir: dirs) {
      tableDesc table = getTableDescFromPath(dir);
      // create a new InputFormat instance if this is the first time this class is seen
      Class inputFormatClass = table.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      InputSplit[] iss = inputFormat.getSplits(newjob, numSplits/dirs.length);
      for(InputSplit is: iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
      }
    }
    return result.toArray(new HiveInputSplit[result.size()]);
