Package org.apache.hadoop.mapreduce.lib.input

Examples of org.apache.hadoop.mapreduce.lib.input.TextInputFormat


  @Override
  protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
   
    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
   
    int nbSplits = splits.size();
    log.debug("Nb splits : " + nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    firstIds = new int[nbSplits];
    sizes = new int[nbSplits];
   
    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (int p = 0; p < nbSplits; p++) {
      InputSplit split = splits.get(p);
      int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition
     
      RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
      reader.initialize(split, task);
     
      Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(),
          hp, nbSplits, numTrees);
View Full Code Here


  protected void secondStep(Configuration conf, Path forestPath, PredictionCallback callback)
      throws IOException, InterruptedException {
    JobContext jobContext = new JobContext(conf, new JobID());
   
    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(jobContext);
   
    int nbSplits = splits.size();
    log.debug("Nb splits : " + nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < nbSplits; p++) {
      total += Step2Mapper.nbConcerned(nbSplits, numTrees, p);
    }

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    secondOutput = new MockContext(new Step2Mapper(), conf, task.getTaskAttemptID(), numTrees);
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < nbSplits; partition++) {
     
      InputSplit split = sorted[partition];
      RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);

      // load the output of the 1st step
      int nbConcerned = Step2Mapper.nbConcerned(nbSplits, numTrees, partition);
      TreeID[] fsKeys = new TreeID[nbConcerned];
      Node[] fsTrees = new Node[nbConcerned];
View Full Code Here

    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, numMaps);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(numMaps, splits.size());

    InputSplit[] sorted = new InputSplit[numMaps];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    context = new Step0Context(new Step0Mapper(), job.getConfiguration(),
        new TaskAttemptID(), numMaps);

    for (int p = 0; p < numMaps; p++) {
      InputSplit split = sorted[p];

      RecordReader<LongWritable, Text> reader = input.createRecordReader(split,
          context);
      reader.initialize(split, context);

      Step0Mapper mapper = new Step0Mapper();
      mapper.configure(p);
View Full Code Here

    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, numMaps);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(numMaps, splits.size());

    InputSplit[] sorted = new InputSplit[numMaps];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    List<Integer> keys = new ArrayList<Integer>();
    List<Step0Output> values = new ArrayList<Step0Output>();

    int[] expectedIds = new int[numMaps];

    TaskAttemptContext context = new TaskAttemptContext(job.getConfiguration(),
        new TaskAttemptID());

    for (int p = 0; p < numMaps; p++) {
      InputSplit split = sorted[p];
      RecordReader<LongWritable, Text> reader = input.createRecordReader(split,
          context);
      reader.initialize(split, context);

      Long firstKey = null;
      int size = 0;
View Full Code Here

            } else {
                result = new PigAvroInputFormat(
                        inputAvroSchema, ignoreBadFiles, schemaToMergedSchemaMap);
            }
        } else {
            result = new TextInputFormat();
        }
        return result;
    }
View Full Code Here

     * Methods called on the frontend
     */

    @Override
    public InputFormat getInputFormat() throws IOException {
        return new TextInputFormat();
    }
View Full Code Here

        InputFormat result = null;
        if(inputAvroSchema != null) {
            result = new PigAvroInputFormat(
            inputAvroSchema, ignoreBadFiles, schemaToMergedSchemaMap, useMultipleSchemas);
        } else {
            result = new TextInputFormat();
        }
        return result;
    }
View Full Code Here

  @Override
  protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
   
    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
   
    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    firstIds = new int[nbSplits];
    sizes = new int[nbSplits];
   
    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (InputSplit split : splits) {
      int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

      RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
      reader.initialize(split, task);

      Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(),
                                               hp, nbSplits, numTrees);
View Full Code Here

  protected void secondStep(Configuration conf, Path forestPath, PredictionCallback callback)
      throws IOException, InterruptedException {
    JobContext jobContext = new JobContext(conf, new JobID());
   
    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(jobContext);
   
    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < nbSplits; p++) {
      total += Step2Mapper.nbConcerned(nbSplits, numTrees, p);
    }

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    secondOutput = new MockContext(new Step2Mapper(), conf, task.getTaskAttemptID(), numTrees);
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < nbSplits; partition++) {
     
      InputSplit split = sorted[partition];
      RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);

      // load the output of the 1st step
      int nbConcerned = Step2Mapper.nbConcerned(nbSplits, numTrees, partition);
      TreeID[] fsKeys = new TreeID[nbConcerned];
      Node[] fsTrees = new Node[nbConcerned];
View Full Code Here

    public InputFormat getInputFormat() throws IOException {
        AvroStorageLog.funcCall("getInputFormat");
        if(inputAvroSchema != null)
            return new PigAvroInputFormat(inputAvroSchema);
        else
            return new TextInputFormat();
    }
View Full Code Here

TOP

Related Classes of org.apache.hadoop.mapreduce.lib.input.TextInputFormat

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Corporation. Contact coftware#gmail.com.