Package org.apache.hadoop.mapred

Examples of org.apache.hadoop.mapred.InputFormat
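The snippets below are excerpts from Hive, Tez, and Hadoop itself that implement or exercise the old-API InputFormat contract: getSplits() carves the input into InputSplits, and getRecordReader() iterates the records of a single split.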


    }

    pushProjectionsAndFilters(cloneJobConf, inputFormatClass, hsplit.getPath()
      .toString(), hsplit.getPath().toUri().getPath(), nonNative);

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass,
        cloneJobConf);
    RecordReader innerReader = null;
    try {
      innerReader = inputFormat.getRecordReader(inputSplit,
        cloneJobConf, reporter);
    } catch (Exception e) {
      innerReader = HiveIOExceptionHandlerUtil
          .handleRecordReaderCreationException(e, cloneJobConf);
    }
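Hive wraps reader creation in a try/catch so a configured IO-exception handler can substitute a recovering reader instead of failing the task. A minimal sketch of the same defensive pattern, with a plain rethrow standing in for Hive's HiveIOExceptionHandlerUtil:

import java.io.IOException;

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class SafeReaderFactory {
  // Creates a record reader, converting any creation failure into an
  // IOException. Hive's handler may instead return a replacement reader.
  public static <K, V> RecordReader<K, V> open(InputFormat<K, V> inputFormat,
      InputSplit split, JobConf job, Reporter reporter) throws IOException {
    try {
      return inputFormat.getRecordReader(split, job, reporter);
    } catch (Exception e) {
      throw new IOException("Could not create record reader for " + split, e);
    }
  }
}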


    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // look up the InputFormat, instantiating it only the first time this
      // class is seen
      Class inputFormatClass = part.getInputFileFormatClass();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);

      // Make filter pushdown information available to getSplits.
      ArrayList<String> aliases =
        mrwork.getPathToAliases().get(dir.toUri().toString());
      if ((aliases != null) && (aliases.size() == 1)) {
        Operator op = mrwork.getAliasToWork().get(aliases.get(0));
        if ((op != null) && (op instanceof TableScanOperator)) {
          TableScanOperator tableScan = (TableScanOperator) op;
          pushFilters(newjob, tableScan);
        }
      }

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
      for (InputSplit is : iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
      }
    }
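HiveInputFormat computes splits one partition directory at a time: resolve the directory's InputFormat, point the job at that single directory, delegate to getSplits, and wrap the results. A simplified sketch of that loop, assuming every directory uses TextInputFormat (Hive resolves the class per partition):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class PerDirectorySplits {
  public static List<InputSplit> splitsFor(JobConf job, Path[] dirs,
      int numSplits) throws IOException {
    List<InputSplit> result = new ArrayList<InputSplit>();
    for (Path dir : dirs) {
      JobConf dirJob = new JobConf(job);
      FileInputFormat.setInputPaths(dirJob, dir);
      // ReflectionUtils calls configure() on JobConfigurable formats,
      // much like Hive's getInputFormatFromCache.
      TextInputFormat format =
          ReflectionUtils.newInstance(TextInputFormat.class, dirJob);
      int share = Math.max(1, numSplits / dirs.length);
      for (InputSplit split : format.getSplits(dirJob, share)) {
        result.add(split); // Hive wraps each one in a HiveInputSplit here
      }
    }
    return result;
  }
}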

    // for each dir, get the InputFormat, and do validateInput.
    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      // look up the InputFormat, instantiating it only the first time this
      // class is seen
      InputFormat inputFormat = getInputFormatFromCache(part
          .getInputFileFormatClass(), job);

      FileInputFormat.setInputPaths(newjob, dir);
      newjob.setInputFormat(inputFormat.getClass());
      ShimLoader.getHadoopShims().inputFormatValidateInput(inputFormat, newjob);
    }
  }
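The shim call hides Hadoop version differences in input validation. A minimal stand-in that checks only that each input directory exists:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class InputChecks {
  public static void validateDirs(JobConf job, Path[] dirs) throws IOException {
    for (Path dir : dirs) {
      FileSystem fs = dir.getFileSystem(job);
      if (!fs.exists(dir)) {
        throw new IOException("Input directory does not exist: " + dir);
      }
    }
  }
}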

 
  @SuppressWarnings({ "rawtypes", "unchecked" })
  @Test(timeout=10000)
  public void testGroupedSplitSize() throws IOException {
    JobConf job = new JobConf(defaultConf);
    InputFormat mockWrappedFormat = mock(InputFormat.class);
    TezGroupedSplitsInputFormat<LongWritable, Text> format =
        new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setInputFormat(mockWrappedFormat);
   
    job.setLong(TezConfiguration.TEZ_AM_GROUPING_SPLIT_MAX_SIZE, 500*1000*1000L);
    job.setLong(TezConfiguration.TEZ_AM_GROUPING_SPLIT_MIN_SIZE, 50*1000*1000L);
    InputSplit mockSplit1 = mock(InputSplit.class);
    when(mockSplit1.getLength()).thenReturn(10*1000*1000L);
    when(mockSplit1.getLocations()).thenReturn(null);
    int numSplits = 100;
    InputSplit[] mockSplits = new InputSplit[numSplits];
    for (int i = 0; i < numSplits; i++) {
      mockSplits[i] = mockSplit1;
    }
    when(mockWrappedFormat.getSplits((JobConf) anyObject(), anyInt())).thenReturn(mockSplits);
   
    // The desired split count is not set, so the grouper derives its group
    // sizes from the total data size and the original split count; here the
    // configured minimum group size is the binding constraint.
    InputSplit[] splits = format.getSplits(job, 0);
    Assert.assertEquals(25, splits.length);
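TezGroupedSplitsInputFormat coalesces many small splits into fewer grouped splits. Its real heuristic also weighs locality, a maximum size, and the desired parallelism, but the core size-based idea can be sketched as greedy packing against a minimum group size:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.mapred.InputSplit;

public class GreedyGrouper {
  // Packs raw splits, in order, into groups of at least minGroupSize bytes.
  // A simplification: the actual Tez grouper also considers locations,
  // a maximum group size, and the desired number of groups.
  public static List<List<InputSplit>> group(InputSplit[] splits,
      long minGroupSize) throws IOException {
    List<List<InputSplit>> groups = new ArrayList<List<InputSplit>>();
    List<InputSplit> current = new ArrayList<InputSplit>();
    long currentSize = 0;
    for (InputSplit split : splits) {
      current.add(split);
      currentSize += split.getLength();
      if (currentSize >= minGroupSize) {
        groups.add(current);
        current = new ArrayList<InputSplit>();
        currentSize = 0;
      }
    }
    if (!current.isEmpty()) {
      groups.add(current); // the tail group may fall below the minimum
    }
    return groups;
  }
}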

  @SuppressWarnings({ "rawtypes", "unchecked" })
  @Test(timeout=10000)
  public void testGroupedSplitWithDuplicates() throws IOException {
    JobConf job = new JobConf(defaultConf);
    InputFormat mockWrappedFormat = mock(InputFormat.class);
    TezGroupedSplitsInputFormat<LongWritable, Text> format =
        new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setInputFormat(mockWrappedFormat);
   
    // put multiple splits with multiple copies in the same location
    String[] locations = {"common", "common", "common"};
    int numSplits = 3;
    InputSplit[] mockSplits = new InputSplit[numSplits];
    for (int i = 0; i < numSplits; i++) {
      InputSplit mockSplit = mock(InputSplit.class);
      when(mockSplit.getLength()).thenReturn(10*1000*1000L);
      when(mockSplit.getLocations()).thenReturn(locations);
      mockSplits[i] = mockSplit;
    }
    when(mockWrappedFormat.getSplits((JobConf) anyObject(), anyInt())).thenReturn(mockSplits);
   
    format.setDesiredNumberOfSplits(1);
    InputSplit[] splits = format.getSplits(job, 1);
    Assert.assertEquals(1, splits.length);
    TezGroupedSplit split = (TezGroupedSplit) splits[0];
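Since every mock split reports the same three copies of the location "common", the grouper can collapse all of them into one group without losing locality. The test body is truncated above; natural follow-up assertions (a hedged guess at its intent, not the verbatim test) would check that the group preserves total length and the shared location:

    Assert.assertEquals(3 * 10*1000*1000L, split.getLength());
    Assert.assertEquals("common", split.getLocations()[0]);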

 
  @SuppressWarnings({ "rawtypes", "unchecked" })
  @Test(timeout=10000)
  public void testGroupedSplitWithBadLocations() throws IOException {
    JobConf job = new JobConf(defaultConf);
    InputFormat mockWrappedFormat = mock(InputFormat.class);
    TezGroupedSplitsInputFormat<LongWritable, Text> format =
        new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setInputFormat(mockWrappedFormat);
   
    // create splits whose locations are null or contain null entries
    int numSplits = 3;
    InputSplit[] mockSplits = new InputSplit[numSplits];
    InputSplit mockSplit1 = mock(InputSplit.class);
    when(mockSplit1.getLength()).thenReturn(10*1000*1000L);
    when(mockSplit1.getLocations()).thenReturn(null);
    mockSplits[0] = mockSplit1;
    InputSplit mockSplit2 = mock(InputSplit.class);
    when(mockSplit2.getLength()).thenReturn(10*1000*1000L);
    when(mockSplit2.getLocations()).thenReturn(new String[] {null});
    mockSplits[1] = mockSplit2;
    InputSplit mockSplit3 = mock(InputSplit.class);
    when(mockSplit3.getLength()).thenReturn(10*1000*1000L);
    when(mockSplit3.getLocations()).thenReturn(new String[] {null, null});
    mockSplits[2] = mockSplit3;

    when(mockWrappedFormat.getSplits((JobConf) anyObject(), anyInt())).thenReturn(mockSplits);
   
    format.setDesiredNumberOfSplits(1);
    InputSplit[] splits = format.getSplits(job, 1);
    Assert.assertEquals(1, splits.length);
    TezGroupedSplit split = (TezGroupedSplit) splits[0];
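Null location arrays, and null entries inside them, come from badly behaved InputFormats, so a grouping wrapper has to normalize locations before using them; that is what this test exercises. An illustrative defensive helper:

import java.util.ArrayList;
import java.util.List;

public class Locations {
  // Returns a non-null array with all null entries removed.
  public static String[] sanitize(String[] locations) {
    if (locations == null) {
      return new String[0];
    }
    List<String> valid = new ArrayList<String>(locations.length);
    for (String location : locations) {
      if (location != null) {
        valid.add(location);
      }
    }
    return valid.toArray(new String[0]);
  }
}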

      if (pd.getMinTransaction() < min) min = pd.getMinTransaction();
    }
    Assert.assertEquals(minTxn, min);
    Assert.assertEquals(maxTxn, max);

    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partLocation.toString());
    job.set("bucket_count", Integer.toString(buckets));
    job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
    InputSplit[] splits = inf.getSplits(job, 1);
    Assert.assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
            inf.getRecordReader(splits[0], job, Reporter.NULL);

    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    for (int i = 0; i < records.length; i++) {
      Assert.assertTrue(rr.next(key, value));
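The read loop above is the standard old-API pattern: create the key and value objects once, then call next() until it returns false and close the reader. The same loop against TextInputFormat (the input path is a placeholder argument):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class ReadLoop {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    TextInputFormat format =
        ReflectionUtils.newInstance(TextInputFormat.class, job);
    InputSplit[] splits = format.getSplits(job, 1);
    RecordReader<LongWritable, Text> reader =
        format.getRecordReader(splits[0], job, Reporter.NULL);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    try {
      while (reader.next(key, value)) {
        System.out.println(key.get() + "\t" + value);
      }
    } finally {
      reader.close();
    }
  }
}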

    }

    for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry :
        formatPaths.entrySet()) {
      Class<? extends InputFormat> formatClass = formatEntry.getKey();
      InputFormat format = (InputFormat) ReflectionUtils.newInstance(
         formatClass, conf);
      List<Path> paths = formatEntry.getValue();

      Map<Class<? extends Mapper>, List<Path>> mapperPaths
          = new HashMap<Class<? extends Mapper>, List<Path>>();

      // Now, for each set of paths that shares a common InputFormat, build
      // a map of Mappers to the paths they are used for
      for (Path path : paths) {
        Class<? extends Mapper> mapperClass = mapperMap.get(path);
        if (!mapperPaths.containsKey(mapperClass)) {
          mapperPaths.put(mapperClass, new LinkedList<Path>());
        }

        mapperPaths.get(mapperClass).add(path);
      }

      // Now each set of paths that shares a common InputFormat and Mapper can
      // be added to the same job, and split together.
      for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths
          .entrySet()) {
        paths = mapEntry.getValue();
        Class<? extends Mapper> mapperClass = mapEntry.getKey();

        if (mapperClass == null) {
          mapperClass = conf.getMapperClass();
        }

        FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths
            .size()]));

        // Get splits for each input path and tag them with InputFormat
        // and Mapper types by wrapping each in a TaggedInputSplit.
        InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
        for (InputSplit pathSplit : pathSplits) {
          splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(),
              mapperClass));
        }
      }
    }
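This is the split-generation half of the old-API MultipleInputs machinery: each path carries its own InputFormat and Mapper, and every split is tagged so the task can dispatch accordingly. The public entry point that records those per-path mappings is MultipleInputs.addInputPath; a usage sketch (the paths are placeholders, and IdentityMapper stands in for real per-path mappers):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsExample {
  public static void configure(JobConf conf) {
    MultipleInputs.addInputPath(conf, new Path("/data/text"),
        TextInputFormat.class, IdentityMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/data/seq"),
        SequenceFileInputFormat.class, IdentityMapper.class);
  }
}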

  static Map<Path, InputFormat> getInputFormatMap(JobConf conf) {
    Map<Path, InputFormat> m = new HashMap<Path, InputFormat>();
    String[] pathMappings = conf.get("mapred.input.dir.formats").split(",");
    for (String pathMapping : pathMappings) {
      String[] split = pathMapping.split(";");
      InputFormat inputFormat;
      try {
        inputFormat = (InputFormat) ReflectionUtils.newInstance(conf
            .getClassByName(split[1]), conf);
      } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
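The property parsed here is a comma-separated list of path;format pairs, which is what calls like MultipleInputs.addInputPath serialize. A hand-built value (the paths are placeholders) would look like:

    conf.set("mapred.input.dir.formats",
        "/data/text;org.apache.hadoop.mapred.TextInputFormat,"
        + "/data/seq;org.apache.hadoop.mapred.SequenceFileInputFormat");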

      } finally {
        writer.close();
      }

      // try splitting the file in a variety of sizes
      InputFormat format = new SequenceFileInputFormat();
      RecInt key = new RecInt();
      RecBuffer value = new RecBuffer();
      for (int i = 0; i < 3; i++) {
        int numSplits =
          random.nextInt(MAX_LENGTH/(SequenceFile.SYNC_INTERVAL/20))+1;
        //LOG.info("splitting: requesting = " + numSplits);
        FileSplit[] splits = format.getSplits(fs, job, numSplits);
        //LOG.info("splitting: got =        " + splits.length);

        // check each split
        BitSet bits = new BitSet(length);
        for (int j = 0; j < splits.length; j++) {
          RecordReader reader =
            format.getRecordReader(fs, splits[j], job, reporter);
          try {
            int count = 0;
            while (reader.next(key, value)) {
              // if (bits.get(key.get())) {
              // LOG.info("splits["+j+"]="+splits[j]+" : " + key.get());
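The BitSet records every key read back, letting the test prove that the splits tile the file exactly: no record is lost at a split boundary and none is read twice. The commented lines hint at the duplicate check; the usual closing steps (a hedged reconstruction, since the excerpt is truncated) look like:

      // inside the loop: fail on a duplicate, then record the key
      Assert.assertFalse("duplicate key " + key.get(), bits.get(key.get()));
      bits.set(key.get());
      // ...and once every split has been consumed:
      Assert.assertEquals("some keys were missed", length, bits.cardinality());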
