Source Code of cbcb.kmulus.blast.BlastMapper

package cbcb.kmulus.blast;


import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;


import cbcb.kmulus.util.Biology;
import cbcb.kmulus.util.PresenceVector;


/**
 * Mapper receives a chunk of input simple fasta sequences. For each k-mer seen in
 * the query sequence, it is checked against each clusters k-mers vector.  If
 * the query shares a k-mer in common with a cluster, the (cluster_id, >query_seq)
 * is emitted.
 */
public class BlastMapper extends Mapper<LongWritable, Text, LongWritable, Text> {


  private static final Logger LOG = Logger.getLogger(BlastMapper.class);


  private int alphabetSize;
  private int kmerLength;
  private int numCenters;
  
  // TODO(cmhill): Change to percentage since reads will be diff sizes.
  private int minKmerMatch;
  
  /** The {@link PresenceVector} for each given cluster center. */
  private PresenceVector[] clusterCenters;
 
  /** Allocate space for the cluster center vectors. */
  private void initializeCenters() {
    if (clusterCenters != null) {
      clusterCenters = null;
    }


    clusterCenters = new PresenceVector[numCenters];
  }


  /**
   * Load the cluster bit vectors in from the specified HDFS path.
   * 
   * @param clusterDir HDFS directory of cluster center bit vectors.
   * @return True if centers copied succesfully, false otherwise.
   * @throws IOException  If hdfs path related error.
   */
  private boolean loadClusters(Configuration conf) throws IOException {
    initializeCenters();
    
    // Load each cluster k-mer bit vectors into memory.
    String clusterDir = conf.get(Blast.CLUSTER_DIR);
    Path clusterPath = new Path(clusterDir);
    
    FileSystem fs = FileSystem.get(conf);
    
    int currCenter = 0;
    // Go through each part-r-* file in the clusterPath and
    // copy over their bit vectors.
    for (FileStatus srcFileStatus : fs.listStatus(clusterPath)) {
      if (srcFileStatus.isDir())
        continue;


      SequenceFile.Reader reader = new SequenceFile.Reader(fs,srcFileStatus.getPath(), conf);


      LongWritable key = new LongWritable();
      PresenceVector value = new PresenceVector();


      while (reader.next(key, value) != false) {
        if (key.toString().equals("")) {
          break;
        }


        clusterCenters[currCenter] = new PresenceVector(value);
        ++currCenter;
        key.set(0);
      }
      reader.close();
    }
    
    if (currCenter == numCenters) {
      return true;
      
    } else {
      LOG.info("Clusters expected: " + numCenters + ", actual clusters: " +
          currCenter);
      return false;
    }
  }
  
  @Override
  protected void setup(Context context) {
    Configuration conf = context.getConfiguration();
    kmerLength = conf.getInt(Blast.KMER_LENGTH, 3);
    alphabetSize = conf.getInt(Blast.ALPHABET_SIZE, Biology.AMINO_ACIDS.length);
    numCenters = conf.getInt(Blast.NUM_CENTERS, -1);
    minKmerMatch = conf.getInt(Blast.MIN_KMER_MATCH, 1);
    
    try {
      loadClusters(conf);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }


  public void map(LongWritable key, Text value, Context context) 
      throws IOException, InterruptedException {
    String headerAndSequence = value.toString().trim();
    
    // TODO(cmhill): More error checking.
    if (!headerAndSequence.startsWith(">")) {
      LOG.info("INPUT SEQUENCE ERROR: " + headerAndSequence);
      return;
    }
    
    String sequence = headerAndSequence.split(Blast.HEADER_SEQUENCE_SEPARATOR)[1];
    
    // Go through all k-mers of the sequence
    Set<Integer> clusters = new HashSet<Integer>();
    Set<Integer> overlappingClusters = new HashSet<Integer>();
    
    // Faster way than this?
    for (int i = 0; i < numCenters; i++) {
      clusters.add(i);
    }
    
    Map<Integer,Integer> kmerMatchCount = new HashMap<Integer, Integer>();
    
    for (int i = 0; i < sequence.length() - kmerLength; i++) {
      
      String kmer = sequence.substring(i, i + kmerLength);
      Set<Integer> kmerHashes = Biology.getNeighborKmerHashes(kmer, 0);
      for (int kmerHash : kmerHashes) {


        // Check which cluster centers overlap with this k-mer.
        for (Integer j : clusters) {  
          if (clusterCenters[j].containsKmer(kmerHash)) {
            if (kmerMatchCount.get(j) == null)
              kmerMatchCount.put(j, 1);
            else
              kmerMatchCount.put(j, kmerMatchCount.get(j) + 1);
            // overlappingClusters.add(j);
          }
        }
        
        // Remove all clusters we've already found overlaps for.
        // clusters.removeAll(overlappingClusters);
      }
    }
    
    // Emit all overlapping clusters.
    for (Integer clusterId : kmerMatchCount.keySet()) {
      if (kmerMatchCount.get(clusterId) >= minKmerMatch)
        context.write(new LongWritable(clusterId), value);
    }
    
    /* for (Integer clusterId : overlappingClusters) {
      context.write(new LongWritable(clusterId), value);
    } */
  }
}
Source Code of cbcb.kmulus.blast.BlastMapper

Related Classes of cbcb.kmulus.blast.BlastMapper