Source Code of eu.stratosphere.nephele.template.AbstractFileInputTask

/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/


package eu.stratosphere.nephele.template;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;


import eu.stratosphere.core.fs.BlockLocation;
import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.core.fs.FileStatus;
import eu.stratosphere.core.fs.FileSystem;
import eu.stratosphere.core.fs.Path;


/**
 * Specialized subtype of {@link AbstractInputTask} for tasks which are supposed to generate input from
 * a file. In addition to {@link AbstractInputTask} this class includes a method to query file splits
 * which should be read during the task's execution.
 * 
 */
public abstract class AbstractFileInputTask extends AbstractInputTask<FileInputSplit> {


  public static final String INPUT_PATH_CONFIG_KEY = "input.path";


  /**
   * The fraction that the last split may be larger than the others.
   */
  private static final float MAX_SPLIT_SIZE_DISCREPANCY = 1.1f;


  // --------------------------------------------------------------------------------------------


  /**
   * Returns an iterator to a (possible empty) list of file input splits which is expected to be consumed by this
   * instance of the {@link AbstractFileInputTask}.
   * 
   * @return an iterator to a (possible empty) list of file input splits.
   */
  public Iterator<FileInputSplit> getFileInputSplits() {


    return new InputSplitIterator<FileInputSplit>(getEnvironment().getInputSplitProvider());
  }




  @Override
  public FileInputSplit[] computeInputSplits(final int minNumSplits) throws IOException {


    final String pathURI = getTaskConfiguration().getString(INPUT_PATH_CONFIG_KEY, null);
    if (pathURI == null) {
      throw new IOException("The path to the file was not found in the runtime configuration.");
    }


    final Path path;
    try {
      path = new Path(pathURI);
    } catch (Exception iaex) {
      throw new IOException("Invalid file path specifier: ", iaex);
    }


    final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>();


    // get all the files that are involved in the splits
    final List<FileStatus> files = new ArrayList<FileStatus>();
    long totalLength = 0;


    final FileSystem fs = path.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(path);


    if (pathFile.isDir()) {
      // input is directory. list all contained files
      final FileStatus[] dir = fs.listStatus(path);
      for (int i = 0; i < dir.length; i++) {
        if (!dir[i].isDir()) {
          files.add(dir[i]);
          totalLength += dir[i].getLen();
        }
      }


    } else {
      files.add(pathFile);
      totalLength += pathFile.getLen();
    }


    final long minSplitSize = 1;
    final long maxSplitSize = (minNumSplits < 1) ? Long.MAX_VALUE : (totalLength / minNumSplits +
          (totalLength % minNumSplits == 0 ? 0 : 1));


    // now that we have the files, generate the splits
    int splitNum = 0;
    for (final FileStatus file : files) {


      final long len = file.getLen();
      final long blockSize = file.getBlockSize();


      final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
      final long halfSplit = splitSize >>> 1;


      final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);


      if (len > 0) {


        // get the block locations and make sure they are in order with respect to their offset
        final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
        Arrays.sort(blocks);


        long bytesUnassigned = len;
        long position = 0;


        int blockIndex = 0;


        while (bytesUnassigned > maxBytesForLastSplit) {
          // get the block containing the majority of the data
          blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
          // create a new split
          final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize,
            blocks[blockIndex]
              .getHosts());
          inputSplits.add(fis);


          // adjust the positions
          position += splitSize;
          bytesUnassigned -= splitSize;
        }


        // assign the last split
        if (bytesUnassigned > 0) {
          blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
          final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position,
            bytesUnassigned,
            blocks[blockIndex].getHosts());
          inputSplits.add(fis);
        }
      } else {
        // special case with a file of zero bytes size
        final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
        String[] hosts;
        if (blocks.length > 0) {
          hosts = blocks[0].getHosts();
        } else {
          hosts = new String[0];
        }
        final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
        inputSplits.add(fis);
      }
    }


    return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
  }


  /**
   * Retrieves the index of the <tt>BlockLocation</tt> that contains the part of the file described by the given
   * offset.
   * 
   * @param blocks
   *        The different blocks of the file. Must be ordered by their offset.
   * @param offset
   *        The offset of the position in the file.
   * @param startIndex
   *        The earliest index to look at.
   * @return The index of the block containing the given position.
   */
  private final int getBlockIndexForPosition(final BlockLocation[] blocks, final long offset,
      final long halfSplitSize, final int startIndex) {
    
    // go over all indexes after the startIndex
    for (int i = startIndex; i < blocks.length; i++) {
      long blockStart = blocks[i].getOffset();
      long blockEnd = blockStart + blocks[i].getLength();


      if (offset >= blockStart && offset < blockEnd) {
        // got the block where the split starts
        // check if the next block contains more than this one does
        if (i < blocks.length - 1 && blockEnd - offset < halfSplitSize) {
          return i + 1;
        } else {
          return i;
        }
      }
    }
    throw new IllegalArgumentException("The given offset is not contained in the any block.");
  }




  @Override
  public Class<FileInputSplit> getInputSplitType() {


    return FileInputSplit.class;
  }
}
Source Code of eu.stratosphere.nephele.template.AbstractFileInputTask

Related Classes of eu.stratosphere.nephele.template.AbstractFileInputTask