Package org.apache.hadoop.mapred

Examples of org.apache.hadoop.mapred.InputFormat


      }

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);

      // Since there is no easy way of knowing whether MAPREDUCE-1597 is present in the tree or not,
      // we use a configuration variable for the same
      if (this.mrwork != null && !this.mrwork.getHadoopSupportsSplittable()) {
        // The following code should be removed, once
View Full Code Here


    }

    pushProjectionsAndFilters(cloneJobConf, inputFormatClass, hsplit.getPath()
      .toString(), hsplit.getPath().toUri().getPath(), nonNative);

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass,
        cloneJobConf);
    RecordReader innerReader = null;
    try {
      innerReader = inputFormat.getRecordReader(inputSplit,
        cloneJobConf, reporter);
    } catch (Exception e) {
      innerReader = HiveIOExceptionHandlerUtil
          .handleRecordReaderCreationException(e, cloneJobConf);
    }
View Full Code Here

    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // create a new InputFormat instance if this is the first time to see this
      // class
      Class inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);

      // Make filter pushdown information available to getSplits.
      ArrayList<String> aliases =
        mrwork.getPathToAliases().get(dir.toUri().toString());
      if ((aliases != null) && (aliases.size() == 1)) {
        Operator op = mrwork.getAliasToWork().get(aliases.get(0));
        if ((op != null) && (op instanceof TableScanOperator)) {
          TableScanOperator tableScan = (TableScanOperator) op;
          pushFilters(newjob, tableScan);
        }
      }

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
      for (InputSplit is : iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
      }
    }
View Full Code Here

    // for each dir, get the InputFormat, and do validateInput.
    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // create a new InputFormat instance if this is the first time to see this
      // class
      InputFormat inputFormat = getInputFormatFromCache(part
          .getInputFileFormatClass(), job);

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      ShimLoader.getHadoopShims().inputFormatValidateInput(inputFormat, newjob);
    }
  }
View Full Code Here

              try {
                ContentSummary resultCs;

                Class<? extends InputFormat> inputFormatCls = partDesc
                    .getInputFileFormatClass();
                InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
                    inputFormatCls, myJobConf);
                if (inputFormatObj instanceof ContentSummaryInputFormat) {
                  resultCs = ((ContentSummaryInputFormat) inputFormatObj).getContentSummary(p,
                      myJobConf);
                } else {
View Full Code Here

      }
    }
    ru.delete(100, 0, 8);
    ru.close(false);

    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(5, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;

    // loop through the first 4 splits and read each; the 5th split is checked after the loop
    for(int i=0; i < 4; ++i) {
      System.out.println("starting split " + i);
      rr = inf.getRecordReader(splits[i], job, Reporter.NULL);
      NullWritable key = rr.createKey();
      OrcStruct value = rr.createValue();

      // there should be exactly two rows per split
      for(int j=0; j < 2; ++j) {
        System.out.println("i = " + i + ", j = " + j);
        assertEquals(true, rr.next(key, value));
        System.out.println("record = " + value);
        assertEquals(i + "." + j, value.getFieldValue(2).toString());
      }
      assertEquals(false, rr.next(key, value));
    }
    rr = inf.getRecordReader(splits[4], job, Reporter.NULL);
    assertEquals(false, rr.next(rr.createKey(), rr.createValue()));
  }
View Full Code Here

      }
    }
    ru.delete(100, 0, 8);
    ru.close(false);

    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(5, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;

    // loop through the first 4 splits and read each; the 5th split is checked after the loop
    for(int i=0; i < 4; ++i) {
      System.out.println("starting split " + i);
      rr = inf.getRecordReader(splits[i], job, Reporter.NULL);
      NullWritable key = rr.createKey();
      OrcStruct value = rr.createValue();

      // there should be exactly two rows per split
      for(int j=0; j < 2; ++j) {
        System.out.println("i = " + i + ", j = " + j);
        assertEquals(true, rr.next(key, value));
        System.out.println("record = " + value);
        assertEquals(i + "." + j, value.getFieldValue(2).toString());
      }
      assertEquals(false, rr.next(key, value));
    }
    rr = inf.getRecordReader(splits[4], job, Reporter.NULL);
    assertEquals(false, rr.next(rr.createKey(), rr.createValue()));
  }
View Full Code Here

    for(int i=0; i < values.length; ++i) {
      ru.insert(2, new MyRow(values[i]));
    }
    ru.close(false);

    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    job.set("bucket_count", "1");
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
    rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    values = new String[]{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"};
    OrcStruct row = rr.createValue();
    for(int i = 0; i < values.length; ++i) {
      System.out.println("Checking " + i);
      assertEquals(true, rr.next(NullWritable.get(), row));
View Full Code Here

    ru = of.getRecordUpdater(root, options);
    values = new String[]{"6", "7", "8"};
    for(int i=0; i < values.length; ++i) {
      ru.insert(1, new MyRow(values[i]));
    }
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", root.toString());
    job.set("bucket_count", "2");

    // read the keys before the delta is flushed
    InputSplit[] splits = inf.getSplits(job, 1);
    assertEquals(2, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
        inf.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    System.out.println("Looking at split " + splits[0]);
    for(int i=1; i < 6; ++i) {
      System.out.println("Checking row " + i);
      assertEquals(true, rr.next(key, value));
      assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
    }
    assertEquals(false, rr.next(key, value));

    ru.flush();
    ru.flush();
    values = new String[]{"9", "10"};
    for(int i=0; i < values.length; ++i) {
      ru.insert(3, new MyRow(values[i]));
    }
    ru.flush();

    splits = inf.getSplits(job, 1);
    assertEquals(2, splits.length);
    rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    Path sideFile = new Path(root +
        "/delta_0000010_0000019/bucket_00001_flush_length");
    assertEquals(true, fs.exists(sideFile));
    assertEquals(24, fs.getFileStatus(sideFile).getLen());
View Full Code Here

            @Override
            public void run() {
              try {
                Class<? extends InputFormat> inputFormatCls = partDesc
                    .getInputFileFormatClass();
                InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
                    inputFormatCls, myJobConf);
                if (inputFormatObj instanceof ContentSummaryInputFormat) {
                  ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
                  resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
                  return;
View Full Code Here

TOP

Related Classes of org.apache.hadoop.mapred.InputFormat

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.