Package com.google.appengine.tools.mapreduce.impl

Source Code of com.google.appengine.tools.mapreduce.impl.FilesByShard

package com.google.appengine.tools.mapreduce.impl;

import com.google.appengine.tools.mapreduce.GoogleCloudStorageFileSet;
import com.google.appengine.tools.mapreduce.impl.util.SplitUtil;
import com.google.common.base.Preconditions;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;


/**
* A convenience datastructure to track sets of files on a per-shard basis
*/
public final class FilesByShard implements Serializable {

  private static final long serialVersionUID = -4160169134959332304L;
  private final String bucket;
  private List<List<String>> allFiles;
  private int shardCount;

  public FilesByShard(int shardCount, String bucket) {
    Preconditions.checkArgument(shardCount > 0);
    this.shardCount = shardCount;
    this.bucket = Preconditions.checkNotNull(bucket);
    this.allFiles = initAllFiles(shardCount);
  }

  public GoogleCloudStorageFileSet getFilesForShard(int shardNumber) {
    Preconditions.checkArgument(shardNumber < shardCount);
    List<String> fileNames = allFiles.get(shardNumber);
    return new GoogleCloudStorageFileSet(getBucket(), fileNames);
  }

  @Override
  public int hashCode() {
    return Objects.hash(allFiles, bucket, shardCount);
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }
    FilesByShard other = (FilesByShard) obj;
    return Objects.equals(allFiles, other.allFiles)
        && Objects.equals(bucket, other.bucket)
        && Objects.equals(shardCount, other.shardCount);
  }

  public void addFilesToShard(int shardNumber, List<String> newFiles) {
    Preconditions.checkArgument(shardNumber < shardCount);
    List<String> fileNames = allFiles.get(shardNumber);
    fileNames.addAll(newFiles);
  }

  public void addFileToShard(int shardNumber, String newFile) {
    Preconditions.checkArgument(shardNumber < shardCount);
    List<String> fileNames = allFiles.get(shardNumber);
    fileNames.add(newFile);
  }

  public int getShardCount() {
    return shardCount;
  }

  @Override
  public String toString() {
    return "FilesByShard [bucket=" + getBucket() + ", allFiles=" + allFiles + ", shardCount="
        + shardCount + "]";
  }

  public String getBucket() {
    return bucket;
  }

  /**
   * Splits the provided input if needed to ensure there are approximately {@code targetNumShards}
   * shards. (Note that it will not combine files from different input shards into a single output
   * shard)
   */
  public void splitShards(int targetNumShards) {
    int origionalShardCount = getShardCount();
    int splitFactor = targetNumShards / origionalShardCount;
    if (splitFactor <= 1) {
      return;
    }
    List<List<String>> oldAllFiles = allFiles;
    shardCount = splitFactor * origionalShardCount;
    allFiles = initAllFiles(shardCount);
    int shard = 0;
    for (List<String> files : oldAllFiles) {
      if (!files.isEmpty()) {
        for (List<String> forShard : SplitUtil.split(files, splitFactor, false)) {
          addFilesToShard(shard++, forShard);
        }
      }
    }
  }

  private static ArrayList<List<String>> initAllFiles(int shardCount) {
    ArrayList<List<String>> result = new ArrayList<>(shardCount);
    for (int i = 0; i < shardCount; i++) {
      result.add(new ArrayList<String>(1));
    }
    return result;
  }
}
TOP

Related Classes of com.google.appengine.tools.mapreduce.impl.FilesByShard

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.