Package protein

Source Code of protein.PureProteinAligner$ProteinAlignerOptions

package protein;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import protein.util.AlphabetMap;
import protein.util.Biology;

import comparison.AllPairsMapper;
import comparison.AllPairsReducer;
import comparison.ExhaustiveUniqueGrouper;
import comparison.PairSplit;
import comparison.PrimeRot;

/**
* Hadoop program which aligns all pairs of protein against one another.  A dynamic programming
* comparison function is used: either for {@link LocalAlignment} or {@link GlobalAlignment}.
*/
public class PureProteinAligner extends Configured implements Tool {

  private static final String SFA_META_FILE_EXT = ".sfm";
  private static final String SFA_FILE_EXT = ".sfa";

  private static final String SFA_DIR = "simple";

  private static final String[] inputExts = {".fa", ".fasta", ".fsa_aa"};

  /**
   * Identifies the different implementations of {@link ExhaustiveUniqueGrouper}s supported by
   * {@link PureProteinAligner}.
   */
  enum EugType {PRIME_ROT, PAIR_SPLIT}

  /*Configuration attribute names for grouping data.*/
  static final String NUM_SEQ_ATTR = "ns";
  static final String EUG_ATTR = "eug";
 
  /*Configuration attribute names for alignment parameters.*/
  static final String COMPARE_ATTR = "la";
  static final String ALPHABET_ATTR = "al";
  static final String MATCH_SCORE_ATTR = "ms";
  static final String MISMATCH_SCORE_ATTR = "mm";
  static final String GAP_SCORE_ATTR = "gap";
  static final String GAP_EXT_SCORE_ATTR = "gex";
  static final String KMER_LEN_ATTR = "kmr";
 
  /** Identifies the way in which two proteins should be compared. */
  enum CompareType {KMER, LOCAL, GLOBAL}
 
  /*Default alignment parameters.*/
  private static final CompareType DEF_COMPARE = CompareType.KMER;
  private static final String DEF_ALPHABET = Biology.alphabetStrA20;
  private static final int DEF_MATCH_SCORE = 10;
  private static final int DEF_MISMATCH_SCORE = -5;
  private static final int DEF_GAP_SCORE = -5;
  private static final int DEF_GAP_EXT_SCORE = -2;
  private static final int DEF_KMER_LEN = 3;
 
  /**
   * The mapper in {@link PureProteinAligner} does minimal work.
   *
   * 1) Parses the id from a sequence.
   * 2) Passes the id to an ExhaustiveUniqueGrouper.
   * 3) Uses the resulting group id as the output key, and the sequence as the output value.
   */
  public static class Map extends AllPairsMapper<LongWritable,Text> {

    private AlphabetMap alphabetMap;
    private ExhaustiveUniqueGrouper eug;

    @Override
    public void setup(Context context) throws IOException {
      Configuration conf = context.getConfiguration();
     
      long numSeq = conf.getLong(NUM_SEQ_ATTR, 0);
      int eugID = conf.getInt(EUG_ATTR, EugType.PRIME_ROT.ordinal());
      alphabetMap = new AlphabetMap(conf.getStrings(ALPHABET_ATTR, DEF_ALPHABET)[0]);
     
      if (numSeq < 0) {
        eug = null;
      } else {
       
        if (eugID ==  EugType.PRIME_ROT.ordinal()) {
          eug = PrimeRot.generatePrimeRot(numSeq);
       
        } else if (eugID == EugType.PRIME_ROT.ordinal()) {
         
          try {
            eug = new PairSplit(numSeq);
          } catch (IOException e) {
            eug = null;
          }

        } else {
          eug = PrimeRot.generatePrimeRot(numSeq);
        }
      }
    }
   
    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      long id = parseId(value);
      String seq = value.toString();
      seq = seq.substring(seq.indexOf(' ') + 1);
     
      // Compress the protein.
      StringBuilder compressed = new StringBuilder();
      for (int i = 0; i < seq.length(); i++) {
        compressed.append(alphabetMap.get(seq.charAt(i)));
      }
     
      Text compressedSeq = new Text(">" + id + " " + compressed.toString());
      super.map(key, compressedSeq, context);
    }

    @Override
    protected long[] getGroups(long id) throws IOException {
      if (eug == null) {
        throw new IOException("Grouper was not properly initialized.");
      }
      return eug.getGroups(id);
    }

    @Override
    protected long parseId(Text value) throws IOException {
      long seqId = parseSeqId(value);

      if (seqId > eug.getNumItems()) {
        throw new IOException("Parsed a sequence id which is too high. Got " +
            seqId + ", should be less than " + eug.getNumItems() +
            ".\nTroublesome sequence:" + value);
      }

      return seqId;
    }
  }

  /**
   * The reducer performs the bulk of the work.
   *
   * For each pair of sequences:
   * 1) Parse the ids of both sequences.
   * 2) Align the two sequences to one another.
   * 3) Combine the ids to form the output key, use the alignment score as the output value.
   */
  public static class Reduce extends AllPairsReducer<Text, LongWritable> {

    private CompareType compareType;
    private int matchScore, mismatchScore, gapScore, gapExtScore;
    private int kmerLength;
   
    @Override
    public void setup(Context context) throws IOException {
      Configuration config = context.getConfiguration();
     
      /* Initialize the parameters for the alignment. */
      compareType =
        CompareType.values()[config.getInt(COMPARE_ATTR, DEF_COMPARE.ordinal())];
      matchScore = config.getInt(MATCH_SCORE_ATTR, DEF_MATCH_SCORE);
      mismatchScore = config.getInt(MISMATCH_SCORE_ATTR, DEF_MISMATCH_SCORE);
      gapScore = config.getInt(GAP_SCORE_ATTR, DEF_GAP_SCORE);
      gapExtScore = config.getInt(GAP_EXT_SCORE_ATTR, DEF_GAP_EXT_SCORE);
      kmerLength = config.getInt(KMER_LEN_ATTR, DEF_KMER_LEN);
    }
   
    @Override
    protected LongWritable compareItems(Text a, Text b) {
      Alignment alignment;
     
      switch (compareType) {
      case LOCAL:
        alignment = LocalAlignment.getLocalAlignment(
          a, b, matchScore, mismatchScore, gapScore, gapExtScore);
        break;
       
      case GLOBAL:
        alignment = GlobalAlignment.getGlobalAlignment(
          a, b, matchScore, mismatchScore, gapScore);
        break;
       
      case KMER:
        // Drop through.
      default:
        alignment = KmerDistance.getKmerAlignment(a, b, kmerLength);
      }

      return new LongWritable(alignment.getDistance());
    }

    @Override
    protected long parseId(Text value) throws IOException {
      return parseSeqId(value);
    }

    @Override
    protected Text copyValue(Text original) {
      return new Text(original);
    }
  }


  /** Handles runtime flags for the {@link PureProteinAligner}. */
  private static class ProteinAlignerOptions {

    private static final String FLAG_MARKER = "-";

    private boolean preprocess = true;
    private boolean convertOnly = false;

    private String baseName;
    private Path fastaPath;
    private Path inputPath;
    private Path outputPath;
   
    private int numReduceTasks = 114;

    public ProteinAlignerOptions(String[] args) throws IOException {

      int i = 0;

      if (args[i].startsWith(FLAG_MARKER)) {
        int toIncrement = 1;

        for (Character c : args[i].substring(1).toCharArray()) {

          switch(c) {

          /*ONLY convert to simple FASTA*/
          case 'c':
          case 'C':
            convertOnly = true;
            break;

          case 's':
          case 'S':
            preprocess = false;
            break;

            /*Specify a base name for this run.*/
          case 'b':
          case 'B':
            baseName = args[i + 1];
            toIncrement++;
            break;
           
          case 'r':
          case 'R':
            numReduceTasks = Integer.parseInt(args[i + 1]);
            toIncrement++;
            break;
           
          default:
            throw new IOException("Unrecognized flag: " + c + ".");
          }

          i+= toIncrement;
        }
      }

      if (i >= args.length - 1)
        throw new IOException("An input and output path are required.");

      fastaPath = new Path(args[i]);
      inputPath = new Path(args[i] + Path.SEPARATOR + SFA_DIR);
      outputPath = new Path(args[i+1]);

      if (baseName == null || baseName.equals(""))
        baseName = PureProteinAligner.computeDefaultBaseName(fastaPath);
    }

    public boolean isPreprocess() {
      return preprocess;
    }

    public boolean isConvertOnly() {
      return convertOnly;
    }

    public String getBaseName() {
      return baseName;
    }

    public Path getFastaPath() {
      return fastaPath;
    }

    public Path getInputPath() {
      return inputPath;
    }

    public Path getOutputPath() {
      return outputPath;
    }

    public int getNumReduceTasks() {
      return numReduceTasks;
    }
  }

  @Override
  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    ProteinAlignerOptions opts = new ProteinAlignerOptions(args);

    if (opts.isPreprocess()) {
      toSimpleFasta(opts.getFastaPath(), opts.getInputPath(), opts.getBaseName());
      System.out.println("Finished preprocessing successfully.");
    }

    if (!opts.isConvertOnly()) {

      Job job = new Job(getConf());
      job.setJarByClass(PureProteinAligner.class);
      job.setJobName("proteinAligner");

      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(LongWritable.class);
      job.setMapOutputKeyClass(LongWritable.class);
      job.setMapOutputValueClass(Text.class);

      job.setMapperClass(Map.class);
      job.setReducerClass(Reduce.class);

      job.setInputFormatClass(TextInputFormat.class);
      job.setOutputFormatClass(TextOutputFormat.class);

      job.setNumReduceTasks(opts.getNumReduceTasks());
     
      FileInputFormat.setInputPaths(job, opts.getInputPath());
      FileOutputFormat.setOutputPath(job, opts.getOutputPath());

      setNumSequences(job, opts.getFastaPath(), opts.getBaseName());

      return job.waitForCompletion(true) ? 0 : 1;
    }
   
    return 0;
  }
 
  public static void main(String[] args) throws IOException {
    int result = 1;
   
    try {
      result = ToolRunner.run(new PureProteinAligner(), args);
     
    } catch (Exception e) {
      e.printStackTrace();
      System.out.println("Job failed.");
    }
    System.exit(result);
  }

  /**
   * Assume the text has been pre-processed in the following form:
   *     >123 GATTACA...
   *
   * Parse the sequence Id.
   */
  public static long parseSeqId(Text value) throws IOException {
   
    /*Read the sequence number*/
    int start = value.find(">");
    int end = value.find(" ") > 0 ? value.find(" ") : Integer.MAX_VALUE;
    end = Math.min(end, value.find("\t") > 0 ? value.find("\t") : Integer.MAX_VALUE);
    end = Math.min(end, value.find("\n") > 0 ? value.find("\n") : Integer.MAX_VALUE);
    String seqId = "";

    if (start < 0 || end == Integer.MAX_VALUE) {
      throw new IOException("Couldn't find " + (start < 0 ? "'>'" : "whitespace") + ".");
    }
   
    try {
      seqId = Text.decode(value.getBytes(), start + 1, end - 1);
      return Long.parseLong(seqId);

    } catch(NumberFormatException e) {
      throw new IOException("Couldn't parse seqID (" + seqId + ") from: " + value);
    } catch (CharacterCodingException e1) {
      throw new IOException("Encoding error parsing seqID from: " + value);
    }
  } 

  /** Retrieves the number of sequences in the input SFA file, from the associated metafile. */
  public static void setNumSequences(Job job, Path fastaPath, String baseName)
      throws IOException {
    FileSystem hdfs = FileSystem.get(job.getConfiguration());

    /*Find the associated meta-file*/
    Path metaPath = new Path(fastaPath + Path.SEPARATOR + baseName + SFA_META_FILE_EXT);

    if (!hdfs.exists(metaPath))
      throw new IOException("Expected to find meta-file at location " + metaPath + ".");

    /*Read the number of sequences*/
    BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(metaPath)));
    Long n = Long.parseLong(br.readLine().trim());
    br.close();
   
    /*Set the corresponding attribute in the JobConf*/
    job.getConfiguration().set(NUM_SEQ_ATTR, n.toString());
  }

  /**
   * Converts a FASTA file to the 'simple FASTA format', which is one sequence
   * per line with internal sequence IDs.  Also generates a meta-file which
   * maps iids -> external ids and keeps a count of the total number of
   * sequences in the FASTA file.
   *
   * If the meta-file has been generated more recently than the FASTA file
   * has been modified, do nothing.
   */
  public static void toSimpleFasta(Path srcPath, String baseName) throws IOException {
    Path defaultDestPath = new Path(srcPath + Path.SEPARATOR + SFA_DIR);
    toSimpleFasta(srcPath, defaultDestPath, baseName);
  }

  /**
   * Converts a FASTA file to the 'simple FASTA format', which is one sequence
   * per line with internal sequence IDs.  Also generates a meta-file which
   * maps iids -> external ids and keeps a count of the total number of
   * sequences in the FASTA file.
   *
   * If the meta-file has been generated more recently than the FASTA file
   * has been modified, do nothing.
   *
   * @param srcPath - Location of the directory containing the input files.
   * @param destPath - Location of the directory where the simple FASTA file
   *         should land.
   */
  public static void toSimpleFasta(Path srcPath, Path destPath, String baseName) throws IOException {

    FileSystem hdfs = FileSystem.get(new Configuration());

    if (!hdfs.exists(srcPath)) {
      throw new IOException(srcPath + " does not exist.");
    }

    for (int i = 0; i < inputExts.length; i++) {

      if (baseName.endsWith(inputExts[i])) {
        baseName = baseName.substring(0, baseName.length() - inputExts[i].length());
        break;
      }
    }

    /* If the directory doesn't exist, try to make it. */
    if (!hdfs.exists(destPath) && !hdfs.mkdirs(destPath)) {
      throw new IOException("Couldn't create directory at " + destPath + ".");

    } else if (!hdfs.getFileStatus(destPath).isDir()) {
      throw new IOException(destPath + " was a file. Expected a directory.");
    }


    /* Look for the associated simple FASTA file. */
    Path sfaPath = new Path(destPath + Path.SEPARATOR + baseName + SFA_FILE_EXT);

    if (!hdfs.exists(sfaPath) && !hdfs.createNewFile(sfaPath)) {
      throw new IOException(
          "Couldn't write " + SFA_FILE_EXT + " file to " + destPath + ".");
    }

    /* Look for the associated meta-file. */
    Path metaPath = new Path(srcPath + Path.SEPARATOR + baseName + SFA_META_FILE_EXT);
    boolean needRewrite = true;

    if (hdfs.exists(metaPath)) {
      /*TODO
      long metaModifyTime = hdfs.getFileStatus(metaPath).getModificationTime();

      /*Check if the sfa file has been updated more recently.

      /*Check if the meta-file was updated more recently than any of the FASTA files

      if (metaStatus.getModificationTime() > srcDir.lastModified())
        needRewrite = false;
       */

    } else if (!hdfs.createNewFile(metaPath)) {
      throw new IOException("Couldn't write meta-file to " + destPath + ".");
    }

    /* Rewrite the FASTA file to 'simple FASTA format'. */
    if (needRewrite) {

      BufferedWriter sfaOut = new BufferedWriter(new OutputStreamWriter(hdfs.create(sfaPath)));
      List<String> seqIds = new ArrayList<String>();
      String line;
      int iid = 0;

      for (FileStatus srcFileStatus : hdfs.listStatus(srcPath)) {
        if (srcFileStatus.isDir() || srcFileStatus.getPath().toString().endsWith(SFA_META_FILE_EXT))
          continue;

        BufferedReader faIn = new BufferedReader(new InputStreamReader(hdfs.open(srcFileStatus.getPath())));
        boolean firstLine = true;

        while((line = faIn.readLine()) != null) {

          /* If this line is a sequence id, convert to an internal id. */
          if (line.startsWith(">")) { 
            if (firstLine) {
              firstLine = false;
            } else {
              sfaOut.append("\n");
            }

            sfaOut.append(">" + iid + " ");
            int firstSpace = line.indexOf(" ");
            firstSpace = firstSpace > 0 ? firstSpace : Integer.MAX_VALUE;
           
            int firstTab = line.indexOf("\t");
            firstTab = firstTab > 0 ? firstTab : Integer.MAX_VALUE;
           
            int firstWhiteSpace = Math.min(firstSpace, firstTab);
            if (firstWhiteSpace < Integer.MAX_VALUE) {
              seqIds.add(iid + " " + line.substring(1, firstWhiteSpace));
              sfaOut.append(line.substring(firstWhiteSpace + 1));
             
            } else {
              seqIds.add(iid + " " + line.substring(1));
            }
            iid++;

            /* If its not a sequence id, write it on the same line. */ 
          } else {
            sfaOut.append(line);
          }
        }
        faIn.close();
      }
      sfaOut.close();
     
      BufferedWriter metaOut = new BufferedWriter(new OutputStreamWriter(hdfs.create(metaPath)));

      /* Write the total number of sequences at the beginning of the file. */
      metaOut.append(iid + "\n");

      /* Write the iid + seqId pairs. */
      for (String seqId : seqIds) {
        metaOut.append(seqId + "\n");
      }

      metaOut.close();
    }

    hdfs.close();
  }

  public static String computeDefaultBaseName(Path fastaPath) throws IOException {
    FileSystem hdfs = FileSystem.get(new Configuration());
    List<String> fileNames = new ArrayList<String>();

    /* Grab all of the file names in the given directory of FASTA files. */
    for (FileStatus fs : hdfs.listStatus(fastaPath)) {
      if (!fs.isDir()) {
        fileNames.add(fs.getPath().getName());
      }
    }
    hdfs.close();

    if (fileNames.isEmpty())
      throw new IOException("Expected input files in directory " + fastaPath + ".");

    /* Sort them for consistency between runs. */
    Collections.sort(fileNames);
    String baseName = fileNames.get(0);

    /* Trim any fasta file extensions. */
    for (String ext : inputExts) {
      if (baseName.endsWith(ext)) {
        baseName = baseName.substring(0, baseName.length() - ext.length());
        break;
      }
    }
    return baseName;
  }
}
TOP

Related Classes of protein.PureProteinAligner$ProteinAlignerOptions

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.