/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.tez.mapreduce.hadoop;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.protobuf.ByteString;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobSubmissionFiles;
import org.apache.hadoop.mapreduce.split.JobSplitWriter;
import org.apache.hadoop.mapreduce.split.TezGroupedSplit;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.TaskLocationHint;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.dag.api.VertexLocationHint;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.input.MRInputLegacy;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos;

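/**
 * Helper methods for working with {@link org.apache.tez.mapreduce.input.MRInput}: generating
 * MapReduce input splits, converting them to and from their protobuf representation, and
 * constructing the user payloads consumed by the input at runtime.
 */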
@Public
@Unstable
public class MRInputHelpers {

  private static final Log LOG = LogFactory.getLog(MRInputHelpers.class);
  private static final int SPLIT_SERIALIZED_LENGTH_ESTIMATE = 40;
  static final String JOB_SPLIT_RESOURCE_NAME = "job.split";
  static final String JOB_SPLIT_METAINFO_RESOURCE_NAME = "job.splitmetainfo";

  /**
   * Set up split generation on the client, with splits distributed to tasks via the
   * traditional MapReduce mechanism of the Distributed Cache.
   * <p/>
   * Usage of this technique for handling splits is not advised. Instead, splits should either
   * be generated in the AM, or generated on the client and distributed via the AM. See {@link
   * org.apache.tez.mapreduce.input.MRInput.MRInputConfigBuilder}.
   * <p/>
   * Note: Attempting to use this method to add multiple Inputs to a Vertex is not supported.
   *
   * This mechanism of propagating splits is not recommended, and may be removed in a
   * subsequent release.
   *
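   * Illustrative usage (a sketch; {@code vertex} and the splits directory are placeholders):
   * <pre>{@code
   * DataSourceDescriptor dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(
   *     conf, new Path("/tmp/job-splits"), true);
   * vertex.addDataSource("MRInput", dsd);
   * }</pre>
   *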
   * @param conf           configuration to be used by {@link org.apache.tez.mapreduce.input.MRInput}.
   *                       This is expected to be fully configured.
   * @param splitsDir      the path to which splits will be generated.
   * @param useLegacyInput whether to use {@link org.apache.tez.mapreduce.input.MRInputLegacy} or
   *                       {@link org.apache.tez.mapreduce.input.MRInput}
   * @return an instance of {@link org.apache.tez.dag.api.DataSourceDescriptor} which can be added
   * as a data source to a {@link org.apache.tez.dag.api.Vertex}
   */
  @InterfaceStability.Unstable
  public static DataSourceDescriptor configureMRInputWithLegacySplitGeneration(Configuration conf,
                                                                               Path splitsDir,
                                                                               boolean useLegacyInput) {
    InputSplitInfo inputSplitInfo = null;
    try {
      inputSplitInfo = generateInputSplits(conf, splitsDir);

      InputDescriptor inputDescriptor = InputDescriptor.create(
          useLegacyInput ? MRInputLegacy.class.getName() : MRInput.class.getName())
          .setUserPayload(createMRInputPayload(conf, null));
      Map<String, LocalResource> additionalLocalResources = new HashMap<String, LocalResource>();
      updateLocalResourcesForInputSplits(FileSystem.get(conf), inputSplitInfo,
          additionalLocalResources);
      DataSourceDescriptor dsd =
          DataSourceDescriptor.create(inputDescriptor, null, inputSplitInfo.getNumTasks(),
              inputSplitInfo.getCredentials(),
              VertexLocationHint.create(inputSplitInfo.getTaskLocationHints()),
              additionalLocalResources);
      return dsd;
    } catch (IOException e) {
      throw new TezUncheckedException("Failed to generate InputSplits", e);
    } catch (InterruptedException e) {
      throw new TezUncheckedException("Failed to generate InputSplits", e);
    } catch (ClassNotFoundException e) {
      throw new TezUncheckedException("Failed to generate InputSplits", e);
    }
  }


  /**
   * Parse the user payload used by {@link org.apache.tez.mapreduce.input.MRInput}.
   *
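   * A minimal sketch of recovering the configuration from a payload built by the
   * createMRInputPayload helpers in this class:
   * <pre>{@code
   * MRRuntimeProtos.MRInputUserPayloadProto proto =
   *     MRInputHelpers.parseMRInputPayload(payload);
   * Configuration conf = TezUtils.createConfFromByteString(proto.getConfigurationBytes());
   * }</pre>
   *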
   * @param payload the {@link org.apache.tez.dag.api.UserPayload} instance
   * @return an instance of {@link org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto},
   * which provides access to the underlying configuration bytes
   * @throws IOException
   */
  @InterfaceStability.Evolving
  public static MRRuntimeProtos.MRInputUserPayloadProto parseMRInputPayload(UserPayload payload)
      throws IOException {
    return MRRuntimeProtos.MRInputUserPayloadProto.parseFrom(ByteString.copyFrom(payload.getPayload()));
  }

  /**
   * Create an instance of {@link org.apache.hadoop.mapred.InputSplit} from the {@link
   * org.apache.tez.mapreduce.input.MRInput} representation of a split.
   *
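   * Round-trip sketch (illustrative; {@code oldSplit} and {@code conf} are assumed to exist):
   * <pre>{@code
   * MRRuntimeProtos.MRSplitProto proto = MRInputHelpers.createSplitProto(oldSplit);
   * InputSplit restored = MRInputHelpers.createOldFormatSplitFromUserPayload(
   *     proto, new SerializationFactory(conf));
   * }</pre>
   *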
   * @param splitProto           The {@link org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto}
   *                             instance representing the split
   * @param serializationFactory the serialization mechanism originally used to write out the split
   * @return an instance of the split
   * @throws java.io.IOException
   */
  @SuppressWarnings("unchecked")
  @InterfaceStability.Evolving
  public static InputSplit createOldFormatSplitFromUserPayload(
      MRRuntimeProtos.MRSplitProto splitProto, SerializationFactory serializationFactory)
      throws IOException {
    // This may not need to use the serialization factory, since the old format
    // always uses Writable to write splits.
    Preconditions.checkNotNull(splitProto, "splitProto cannot be null");
    String className = splitProto.getSplitClassName();
    Class<InputSplit> clazz;

    try {
      clazz = (Class<InputSplit>) Class.forName(className);
    } catch (ClassNotFoundException e) {
      throw new IOException("Failed to load InputSplit class: [" + className + "]", e);
    }

    Deserializer<InputSplit> deserializer = serializationFactory
        .getDeserializer(clazz);
    deserializer.open(splitProto.getSplitBytes().newInput());
    InputSplit inputSplit = deserializer.deserialize(null);
    deserializer.close();
    return inputSplit;
  }

  /**
   * Create an instance of {@link org.apache.hadoop.mapreduce.InputSplit} from the {@link
   * org.apache.tez.mapreduce.input.MRInput} representation of a split.
   *
   * @param splitProto           The {@link org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto}
   *                             instance representing the split
   * @param serializationFactory the serialization mechanism originally used to write out the split
   * @return an instance of the split
   * @throws IOException
   */
  @InterfaceStability.Evolving
  @SuppressWarnings("unchecked")
  public static org.apache.hadoop.mapreduce.InputSplit createNewFormatSplitFromUserPayload(
      MRRuntimeProtos.MRSplitProto splitProto, SerializationFactory serializationFactory)
      throws IOException {
    Preconditions.checkNotNull(splitProto, "splitProto must be specified");
    String className = splitProto.getSplitClassName();
    Class<org.apache.hadoop.mapreduce.InputSplit> clazz;

    try {
      clazz = (Class<org.apache.hadoop.mapreduce.InputSplit>) Class
          .forName(className);
    } catch (ClassNotFoundException e) {
      throw new IOException("Failed to load InputSplit class: [" + className + "]", e);
    }

    Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer = serializationFactory
        .getDeserializer(clazz);
    deserializer.open(splitProto.getSplitBytes().newInput());
    org.apache.hadoop.mapreduce.InputSplit inputSplit = deserializer
        .deserialize(null);
    deserializer.close();
    return inputSplit;
  }

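  /**
   * Serialize a new-api {@link org.apache.hadoop.mapreduce.InputSplit} into a
   * {@link org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto}, recording the split
   * class name alongside the serialized bytes so that the split can be reconstructed later.
   */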
  @InterfaceStability.Evolving
  public static <T extends org.apache.hadoop.mapreduce.InputSplit> MRRuntimeProtos.MRSplitProto createSplitProto(
      T newSplit, SerializationFactory serializationFactory)
      throws IOException, InterruptedException {
    MRRuntimeProtos.MRSplitProto.Builder builder = MRRuntimeProtos.MRSplitProto
        .newBuilder();

    builder.setSplitClassName(newSplit.getClass().getName());

    @SuppressWarnings("unchecked")
    Serializer<T> serializer = serializationFactory
        .getSerializer((Class<T>) newSplit.getClass());
    ByteString.Output out = ByteString
        .newOutput(SPLIT_SERIALIZED_LENGTH_ESTIMATE);
    serializer.open(out);
    serializer.serialize(newSplit);
    // TODO MR Compat: Check against max block locations per split.
    ByteString splitBs = out.toByteString();
    builder.setSplitBytes(splitBs);

    return builder.build();
  }

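  /**
   * Serialize an old-api {@link org.apache.hadoop.mapred.InputSplit} into a
   * {@link org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto}. Old-api splits are
   * Writable, so the bytes are written directly rather than via a serialization factory.
   */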
  @InterfaceStability.Evolving
  public static MRRuntimeProtos.MRSplitProto createSplitProto(
      org.apache.hadoop.mapred.InputSplit oldSplit) throws IOException {
    MRRuntimeProtos.MRSplitProto.Builder builder = MRRuntimeProtos.MRSplitProto.newBuilder();

    builder.setSplitClassName(oldSplit.getClass().getName());

    ByteString.Output os = ByteString
        .newOutput(SPLIT_SERIALIZED_LENGTH_ESTIMATE);
    oldSplit.write(new DataOutputStream(os));
    ByteString splitBs = os.toByteString();
    builder.setSplitBytes(splitBs);

    return builder.build();
  }

  /**
   * Generates input splits and stores them in a {@link org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitsProto} instance.
   *
   * Returns an instance of {@link InputSplitInfoMem}.
   *
   * With grouping enabled, the eventual configuration used by the tasks will have
   * the user-specified InputFormat replaced by either {@link org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat}
   * or {@link org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat}.
   *
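   * Example sketch (grouping enabled, targeting roughly ten tasks; {@code conf} must be
   * fully configured for split generation):
   * <pre>{@code
   * InputSplitInfoMem splitInfo = MRInputHelpers.generateInputSplitsToMem(conf, true, 10);
   * int parallelism = splitInfo.getNumTasks();
   * }</pre>
   *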
   * @param conf
   *          an instance of Configuration which is used to determine whether
   *          the mapred or mapreduce API is being used. This Configuration
   *          instance should also contain adequate information to be able to
   *          generate splits - like the InputFormat being used and related
   *          configuration.
   * @param groupSplits whether to group the splits or not
   * @param targetTasks the number of target tasks if grouping is enabled. Specify as 0 otherwise.
   * @return an instance of {@link InputSplitInfoMem} which supports a subset of
   *         the APIs defined on {@link InputSplitInfo}
   * @throws IOException
   * @throws ClassNotFoundException
   * @throws InterruptedException
   */
  @InterfaceStability.Unstable
  public static InputSplitInfoMem generateInputSplitsToMem(Configuration conf, boolean groupSplits,
                                                           int targetTasks)
      throws IOException, ClassNotFoundException, InterruptedException {

    InputSplitInfoMem splitInfoMem = null;
    JobConf jobConf = new JobConf(conf);
    if (jobConf.getUseNewMapper()) {
      LOG.info("Generating mapreduce api input splits");
      Job job = Job.getInstance(conf);
      org.apache.hadoop.mapreduce.InputSplit[] splits =
          generateNewSplits(job, groupSplits, targetTasks);
      splitInfoMem = new InputSplitInfoMem(splits, createTaskLocationHintsFromSplits(splits),
          splits.length, job.getCredentials(), job.getConfiguration());
    } else {
      LOG.info("Generating mapred api input splits");
      org.apache.hadoop.mapred.InputSplit[] splits =
          generateOldSplits(jobConf, groupSplits, targetTasks);
      splitInfoMem = new InputSplitInfoMem(splits, createTaskLocationHintsFromSplits(splits),
          splits.length, jobConf.getCredentials(), jobConf);
    }
    LOG.info("NumSplits: " + splitInfoMem.getNumTasks() + ", SerializedSize: "
        + splitInfoMem.getSplitsProto().getSerializedSize());
    return splitInfoMem;
  }

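  /**
   * Derive {@link TaskLocationHint}s from new-api splits. A grouped split resolves to its rack
   * hint when one is set; otherwise the split's node locations are used.
   */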
  private static List<TaskLocationHint> createTaskLocationHintsFromSplits(
      org.apache.hadoop.mapreduce.InputSplit[] newFormatSplits) {
    Iterable<TaskLocationHint> iterable = Iterables
        .transform(Arrays.asList(newFormatSplits),
            new Function<org.apache.hadoop.mapreduce.InputSplit, TaskLocationHint>() {
              @Override
              public TaskLocationHint apply(
                  org.apache.hadoop.mapreduce.InputSplit input) {
                try {
                  if (input instanceof TezGroupedSplit) {
                    String rack =
                        ((org.apache.hadoop.mapreduce.split.TezGroupedSplit) input).getRack();
                    if (rack == null) {
                      if (input.getLocations() != null) {
                        return TaskLocationHint.createTaskLocationHint(
                            new HashSet<String>(Arrays.asList(input.getLocations())), null);
                      } else {
                        return TaskLocationHint.createTaskLocationHint(null, null);
                      }
                    } else {
                      return TaskLocationHint.createTaskLocationHint(null,
                          Collections.singleton(rack));
                    }
                  } else {
                    return TaskLocationHint.createTaskLocationHint(
                        new HashSet<String>(Arrays.asList(input.getLocations())), null);
                  }
                } catch (IOException e) {
                  throw new RuntimeException(e);
                } catch (InterruptedException e) {
                  throw new RuntimeException(e);
                }
              }
            });
    return Lists.newArrayList(iterable);
  }

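  /** Old-api counterpart of the method above; derives location hints from mapred splits. */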
  private static List<TaskLocationHint> createTaskLocationHintsFromSplits(
      org.apache.hadoop.mapred.InputSplit[] oldFormatSplits) {
    Iterable<TaskLocationHint> iterable = Iterables.transform(Arrays.asList(oldFormatSplits),
        new Function<org.apache.hadoop.mapred.InputSplit, TaskLocationHint>() {
          @Override
          public TaskLocationHint apply(org.apache.hadoop.mapred.InputSplit input) {
            try {
              if (input instanceof org.apache.hadoop.mapred.split.TezGroupedSplit) {
                String rack = ((org.apache.hadoop.mapred.split.TezGroupedSplit) input).getRack();
                if (rack == null) {
                  if (input.getLocations() != null) {
                    return TaskLocationHint.createTaskLocationHint(new HashSet<String>(Arrays.asList(
                        input.getLocations())), null);
                  } else {
                    return TaskLocationHint.createTaskLocationHint(null, null);
                  }
                } else {
                  return TaskLocationHint.createTaskLocationHint(null, Collections.singleton(rack));
                }
              } else {
                return TaskLocationHint.createTaskLocationHint(
                    new HashSet<String>(Arrays.asList(input.getLocations())),
                    null);
              }
            } catch (IOException e) {
              throw new RuntimeException(e);
            }
          }
        });
    return Lists.newArrayList(iterable);
  }

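  /**
   * Instantiate the configured new-api InputFormat (wrapping it in
   * TezGroupedSplitsInputFormat when grouping is requested) and generate splits,
   * sorted by descending size.
   */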
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private static org.apache.hadoop.mapreduce.InputSplit[] generateNewSplits(
      JobContext jobContext, boolean groupSplits, int numTasks)
      throws ClassNotFoundException, IOException,
      InterruptedException {
    Configuration conf = jobContext.getConfiguration();

    // This is the real input format.
    org.apache.hadoop.mapreduce.InputFormat<?, ?> inputFormat = null;
    try {
      inputFormat = ReflectionUtils.newInstance(jobContext.getInputFormatClass(), conf);
    } catch (ClassNotFoundException e) {
      throw new TezUncheckedException(e);
    }

    org.apache.hadoop.mapreduce.InputFormat<?, ?> finalInputFormat = inputFormat;

    // For grouping, the underlying InputFormat class is passed in as a parameter.
    // The JobContext has this set up as TezGroupedSplitsInputFormat.
    if (groupSplits) {
      org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat groupedFormat =
          new org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat();
      groupedFormat.setConf(conf);
      groupedFormat.setInputFormat(inputFormat);
      groupedFormat.setDesiredNumberOfSplits(numTasks);
      finalInputFormat = groupedFormat;
    } else {
      finalInputFormat = inputFormat;
    }

    List<org.apache.hadoop.mapreduce.InputSplit> newSplits = finalInputFormat
        .getSplits(jobContext);
    org.apache.hadoop.mapreduce.InputSplit[] splits =
        newSplits.toArray(new org.apache.hadoop.mapreduce.InputSplit[newSplits.size()]);

    // Sort the splits by size, descending, so that the biggest go first.
    Arrays.sort(splits, new InputSplitComparator());
    return splits;
  }

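  /**
   * Old-api counterpart of generateNewSplits: instantiate the configured mapred InputFormat
   * (optionally wrapped for grouping) and generate splits, sorted by descending size.
   */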
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(
      JobConf jobConf, boolean groupSplits, int numTasks) throws IOException {

    // This is the real InputFormat
    org.apache.hadoop.mapred.InputFormat inputFormat;
    try {
      inputFormat = jobConf.getInputFormat();
    } catch (Exception e) {
      throw new TezUncheckedException(e);
    }

    org.apache.hadoop.mapred.InputFormat finalInputFormat = inputFormat;

    if (groupSplits) {
      org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat groupedFormat =
          new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
      groupedFormat.setConf(jobConf);
      groupedFormat.setInputFormat(inputFormat);
      groupedFormat.setDesiredNumberOfSplits(numTasks);
      finalInputFormat = groupedFormat;
    } else {
      finalInputFormat = inputFormat;
    }
    org.apache.hadoop.mapred.InputSplit[] splits = finalInputFormat
        .getSplits(jobConf, jobConf.getNumMapTasks());
    // Sort the splits by size, descending, so that the biggest go first.
    Arrays.sort(splits, new OldInputSplitComparator());
    return splits;
  }

  /**
   * Comparator for org.apache.hadoop.mapreduce.InputSplit
   */
  private static class InputSplitComparator
      implements Comparator<org.apache.hadoop.mapreduce.InputSplit> {
    @Override
    public int compare(org.apache.hadoop.mapreduce.InputSplit o1,
                       org.apache.hadoop.mapreduce.InputSplit o2) {
      try {
        long len1 = o1.getLength();
        long len2 = o2.getLength();
        if (len1 < len2) {
          return 1;
        } else if (len1 == len2) {
          return 0;
        } else {
          return -1;
        }
      } catch (IOException ie) {
        throw new RuntimeException("exception in InputSplit compare", ie);
      } catch (InterruptedException ie) {
        throw new RuntimeException("exception in InputSplit compare", ie);
      }
    }
  }

  /**
   * Comparator for org.apache.hadoop.mapred.InputSplit
   */
  private static class OldInputSplitComparator
      implements Comparator<org.apache.hadoop.mapred.InputSplit> {
    @Override
    public int compare(org.apache.hadoop.mapred.InputSplit o1,
                       org.apache.hadoop.mapred.InputSplit o2) {
      try {
        long len1 = o1.getLength();
        long len2 = o2.getLength();
        if (len1 < len2) {
          return 1;
        } else if (len1 == len2) {
          return 0;
        } else {
          return -1;
        }
      } catch (IOException ie) {
        throw new RuntimeException("Problem getting input split size", ie);
      }
    }
  }

  /**
   * Generate new-api mapreduce InputFormat splits
   * @param jobContext JobContext required by InputFormat
   * @param inputSplitDir Directory in which to generate splits information
   *
   * @return InputSplitInfo containing the split files' information and the
   * location hints for each split generated, used to determine parallelism of
   * the map stage.
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  private static InputSplitInfoDisk writeNewSplits(JobContext jobContext,
                                                   Path inputSplitDir) throws IOException, InterruptedException,
      ClassNotFoundException {

    org.apache.hadoop.mapreduce.InputSplit[] splits =
        generateNewSplits(jobContext, false, 0);

    Configuration conf = jobContext.getConfiguration();

    JobSplitWriter.createSplitFiles(inputSplitDir, conf,
        inputSplitDir.getFileSystem(conf), splits);

    List<TaskLocationHint> locationHints =
        new ArrayList<TaskLocationHint>(splits.length);
    for (int i = 0; i < splits.length; ++i) {
      locationHints.add(
          TaskLocationHint.createTaskLocationHint(new HashSet<String>(
              Arrays.asList(splits[i].getLocations())), null)
      );
    }

    return new InputSplitInfoDisk(
        JobSubmissionFiles.getJobSplitFile(inputSplitDir),
        JobSubmissionFiles.getJobSplitMetaFile(inputSplitDir),
        splits.length, locationHints, jobContext.getCredentials());
  }

  /**
   * Generate old-api mapred InputFormat splits
   * @param jobConf JobConf required by InputFormat class
   * @param inputSplitDir Directory in which to generate splits information
   *
   * @return InputSplitInfo containing the split files' information and the
   * number of splits generated, used to determine parallelism of
   * the map stage.
   *
   * @throws IOException
   */
  private static InputSplitInfoDisk writeOldSplits(JobConf jobConf,
                                                   Path inputSplitDir) throws IOException {

    org.apache.hadoop.mapred.InputSplit[] splits =
        generateOldSplits(jobConf, false, 0);

    JobSplitWriter.createSplitFiles(inputSplitDir, jobConf,
        inputSplitDir.getFileSystem(jobConf), splits);

    List<TaskLocationHint> locationHints =
        new ArrayList<TaskLocationHint>(splits.length);
    for (int i = 0; i < splits.length; ++i) {
      locationHints.add(
          TaskLocationHint.createTaskLocationHint(new HashSet<String>(
              Arrays.asList(splits[i].getLocations())), null)
      );
    }

    return new InputSplitInfoDisk(
        JobSubmissionFiles.getJobSplitFile(inputSplitDir),
        JobSubmissionFiles.getJobSplitMetaFile(inputSplitDir),
        splits.length, locationHints, jobConf.getCredentials());
  }

  /**
   * Helper API to generate splits.
   * @param conf Configuration with all necessary information set to generate
   * splits. The following are required at a minimum:
   *
   *   - mapred.mapper.new-api: determine whether mapred.InputFormat or
   *     mapreduce.InputFormat is to be used
   *   - mapred.input.format.class or mapreduce.job.inputformat.class:
   *     determines the InputFormat class to be used
   *
   * In addition to this, all the configs needed by the InputFormat class also
   * have to be set. For example, FileInputFormat needs the input directory
   * paths to be set in the config.
   *
   * @param inputSplitsDir Directory in which the splits file and meta info file
   * will be generated. job.split and job.splitmetainfo files in this directory
   * will be overwritten. Should be a fully-qualified path.
   *
   * @return InputSplitInfo containing the split files' information and the
   * number of splits generated, used to determine parallelism of
   * the map stage.
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  private static InputSplitInfoDisk generateInputSplits(Configuration conf,
                                                        Path inputSplitsDir) throws IOException, InterruptedException,
      ClassNotFoundException {
    Job job = Job.getInstance(conf);
    JobConf jobConf = new JobConf(conf);
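    // Disable splits-via-events on the caller's conf; callers (see
    // configureMRInputWithLegacySplitGeneration) serialize this same conf into the MRInput
    // payload afterwards, so tasks on this legacy path read splits from the distributed
    // split files rather than expecting them via events.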
    conf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);
    if (jobConf.getUseNewMapper()) {
      LOG.info("Generating new input splits"
          + ", splitsDir=" + inputSplitsDir.toString());
      return writeNewSplits(job, inputSplitsDir);
    } else {
      LOG.info("Generating old input splits"
          + ", splitsDir=" + inputSplitsDir.toString());
      return writeOldSplits(jobConf, inputSplitsDir);
    }
  }

  /**
   * Update the provided localResources collection with the local
   * resources required by MapReduce tasks with respect to input splits.
   *
   * @param fs Filesystem instance used to access the status of split-related files
   * @param inputSplitInfo Information on the location of split files
   * @param localResources LocalResources collection to be updated
   * @throws IOException
   */
  private static void updateLocalResourcesForInputSplits(
      FileSystem fs,
      InputSplitInfo inputSplitInfo,
      Map<String, LocalResource> localResources) throws IOException {
    if (localResources.containsKey(JOB_SPLIT_RESOURCE_NAME)) {
      throw new RuntimeException("LocalResources already contains a"
          + " resource named " + JOB_SPLIT_RESOURCE_NAME);
    }
    if (localResources.containsKey(JOB_SPLIT_METAINFO_RESOURCE_NAME)) {
      throw new RuntimeException("LocalResources already contains a"
          + " resource named " + JOB_SPLIT_METAINFO_RESOURCE_NAME);
    }

    FileStatus splitFileStatus =
        fs.getFileStatus(inputSplitInfo.getSplitsFile());
    FileStatus metaInfoFileStatus =
        fs.getFileStatus(inputSplitInfo.getSplitsMetaInfoFile());
    localResources.put(JOB_SPLIT_RESOURCE_NAME,
        LocalResource.newInstance(
            ConverterUtils.getYarnUrlFromPath(inputSplitInfo.getSplitsFile()),
            LocalResourceType.FILE,
            LocalResourceVisibility.APPLICATION,
            splitFileStatus.getLen(), splitFileStatus.getModificationTime()));
    localResources.put(JOB_SPLIT_METAINFO_RESOURCE_NAME,
        LocalResource.newInstance(
            ConverterUtils.getYarnUrlFromPath(
                inputSplitInfo.getSplitsMetaInfoFile()),
            LocalResourceType.FILE,
            LocalResourceVisibility.APPLICATION,
            metaInfoFileStatus.getLen(),
            metaInfoFileStatus.getModificationTime()));
  }

  /**
   * Called to specify that grouping of input splits be performed by Tez.
   * The conf should have the input format class configuration
   * set to the TezGroupedSplitsInputFormat. The real input format class name
   * should be passed as an argument to this method.
   * <p/>
   * With grouping enabled, the eventual configuration used by the tasks will have
   * the user-specified InputFormat replaced by either {@link org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat}
   * or {@link org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat}.
   */
  @InterfaceAudience.Private
  protected static UserPayload createMRInputPayloadWithGrouping(Configuration conf) throws IOException {
    Preconditions
        .checkArgument(conf != null, "Configuration must be specified");
    return createMRInputPayload(TezUtils.createByteStringFromConf(conf),
        null, true);
  }

  @InterfaceAudience.Private
  protected static UserPayload createMRInputPayload(Configuration conf,
                                               MRRuntimeProtos.MRSplitsProto mrSplitsProto) throws
      IOException {
    Preconditions
        .checkArgument(conf != null, "Configuration must be specified");

    return createMRInputPayload(TezUtils.createByteStringFromConf(conf),
        mrSplitsProto, false);
  }

  private static UserPayload createMRInputPayload(ByteString bytes,
                                             MRRuntimeProtos.MRSplitsProto mrSplitsProto,
                                             boolean isGrouped) throws IOException {
    MRRuntimeProtos.MRInputUserPayloadProto.Builder userPayloadBuilder =
        MRRuntimeProtos.MRInputUserPayloadProto
            .newBuilder();
    userPayloadBuilder.setConfigurationBytes(bytes);
    if (mrSplitsProto != null) {
      userPayloadBuilder.setSplits(mrSplitsProto);
    }
    userPayloadBuilder.setGroupingEnabled(isGrouped);
    return UserPayload.create(userPayloadBuilder.build().toByteString().asReadOnlyByteBuffer());
  }

}