Package abra

Source Code of abra.NativeAssembler$Position

/* Copyright 2013 University of North Carolina at Chapel Hill.  All rights reserved. */
package abra;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;

import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader.ValidationStringency;

/**
* Handles regional assembly by invoking the native assembler.
*
* @author Lisle E. Mose (lmose at unc dot edu)
*/
public class NativeAssembler {
 
  //TODO: Calc dynamically
  private static final int MAX_READ_LENGTHS_PER_REGION = 6;
 
  public static final int CYCLE_KMER_LENGTH_THRESHOLD = 43;
 
  private static final int MIN_CANDIDATE_BASE_QUALITY = 10;
 
  private boolean truncateOnRepeat;
  private int maxContigs;
  private int maxPathsFromRoot;
  private int readLength;
  private int[] kmers;
  private int minKmerFrequency;
  private int minBaseQuality;
  private double minReadCandidateFraction;
  private Set<String> readIds;
  private int maxAverageDepth;
  List<Position> svCandidates = new ArrayList<Position>();
  List<BreakpointCandidate> svCandidateRegions = new ArrayList<BreakpointCandidate>();
  private boolean shouldSearchForSv = false;
  private boolean isCycleExceedingThresholdDetected = false;
  private int averageDepthCeiling;

  private native String assemble(String input, String output, String prefix, int truncateOnRepeat, int maxContigs, int maxPathsFromRoot, int readLength, int kmerSize, int minKmerFreq, int minBaseQuality);
 
  private String getIdentifier(SAMRecord read) {
    String id = read.getReadName();
   
    if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
      id += "_2";
    }
   
    return id;
  }
 
  private boolean isHardClipped(SAMRecord read) {
    return read.getCigarString().contains("H");
  }
 
  private void filterPositionList(List<Integer> positions, int currentPos) {
    Iterator<Integer> iter = positions.iterator();
    while (iter.hasNext()) {
      if (iter.next() < currentPos-readLength) {
        iter.remove();
      }
    }
  }
 
  //
  //  Require at lest <fraction> number of candidate reads per average region depth
  //  inclusive of overlapping reads.
  private int minCandidateCount(int numReads, Feature region) {
    double fraction = minReadCandidateFraction;
   
    int minCount = (int) ((double) numReads / readLengthsPerRegion(region) * fraction);
   
    // Always require at least 2 candidate reads.
    return Math.max(minCount, 2);
  }
 
  // Calc number of read lengths per region, padding by 2 to account for reads overlapping region ends.
  private double readLengthsPerRegion(Feature region) {
    return (double) region.getLength() / (double) readLength + 2;
  }
 
  private double readLengthsForAllRegions(List<Feature> regions) {
    double lengths = 0;
    for (Feature region : regions) {
      lengths += readLengthsPerRegion(region);
    }
    return lengths;
  }
 
  //
  //  Calc desired number of reads per file.
  private int desiredNumberOfReads(List<Feature> regions) {
    return (int) (readLengthsForAllRegions(regions) * (double) maxAverageDepth);
  }
 
  private int maxNumberOfReadsPerSample(List<Feature> regions) {
    return (int) (readLengthsForAllRegions(regions) * (double) averageDepthCeiling);
  }
 
  private boolean isAssemblyTriggerCandidate(SAMRecord read, CompareToReference2 c2r) {
    boolean isCandidate = false;
   
    // Increment candidate count for indels
    if (read.getCigarString().contains("I") || read.getCigarString().contains("D")) {
      isCandidate = true;
    }

    // Increment candidate count for substantial high quality soft clipping
    // TODO: Check for chimera directly?
    if (read.getCigarString().contains("S")) {
      if (c2r.numHighQualityMismatches(read, MIN_CANDIDATE_BASE_QUALITY) > (readLength/10)) {
        isCandidate = true;
      }
    }
   
    // Increment candidate count if read contains at least 3 high quality mismatches
    if (SAMRecordUtils.getIntAttribute(read, "NM") >= 3) {
      if (c2r.numHighQualityMismatches(read, MIN_CANDIDATE_BASE_QUALITY) > 3) {
        isCandidate = true;
      }
    }

    return isCandidate;
  }
 
  public String simpleAssemble(List<SAMRecord> reads) {
   
    StringBuffer readBuffer = new StringBuffer();
   
    for (SAMRecord read : reads) {
      readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0");
     
      if (read.getReadString().length() == readLength) {
        readBuffer.append(read.getReadString());
        readBuffer.append(read.getBaseQualityString());
      } else {
        StringBuffer basePadding = new StringBuffer();
        StringBuffer qualPadding = new StringBuffer();
       
        for (int i=0; i<readLength-read.getReadString().length(); i++) {
          basePadding.append('N');
          qualPadding.append('!');
        }
       
        readBuffer.append(read.getReadString() + basePadding.toString());
        readBuffer.append(read.getBaseQualityString() + qualPadding.toString());             
      }
    }
   


    SAMRecord lastRead = reads.get(reads.size()-1);
    int regionStart = reads.get(0).getAlignmentStart();
    int regionEnd = lastRead.getAlignmentEnd() > 0 ? lastRead.getAlignmentEnd() : lastRead.getAlignmentStart();
   
    String output = "region_" + reads.get(0).getReferenceName() + "_" + regionStart + "_" + regionEnd;
    String contigs = "";
   
    // Make this set of reads eligible for GC
//    reads.clear();
   
    for (int kmer : kmers) {
     
      String outputFile = output + "_k" + kmer;
     
      contigs = assemble(
          readBuffer.toString(),
          outputFile,
          output,
          1, // truncate_on_repeat
          maxContigs,
          maxPathsFromRoot,
          readLength,
          kmer,
          minKmerFrequency,
          minBaseQuality);
     
      if (!contigs.equals("<REPEAT>")) {
        break;
      }
    }

    return contigs;
  }
 
  public String assembleContigs(List<String> inputFiles, String output, String tempDir, List<Feature> regions, String prefix,
      boolean checkForDupes, ReAligner realigner, CompareToReference2 c2r) {
   
    StringBuffer buf = new StringBuffer();
   
    if ((kmers.length == 0) || (kmers[0] < KmerSizeEvaluator.MIN_KMER)) {
      KmerSizeEvaluator kmerEval = new KmerSizeEvaluator();
      int kmer = kmerEval.identifyMinKmer(readLength, c2r, regions);
      this.kmers = realigner.toKmerArray(kmer, readLength);
      buf.append("Dynamic -- ");
    } else {
      buf.append("Predefined -- ");
    }
   
    for (Feature region : regions) {
      buf.append(region.getDescriptor());
      buf.append(" ");
    }
    for (int k : kmers) {
      buf.append(k);
      buf.append(',');
    }

    System.out.println(buf.toString());
   
    String contigs = "";
   
    long start = System.currentTimeMillis();
   
    int readCount = 0;
   
    int minReadCount = Integer.MAX_VALUE;
   
    int maxReadsPerSample = maxNumberOfReadsPerSample(regions);
   
    boolean isMaxReadsForRegionExceeded = false;

    // if c2r is null, this is the unaligned region.
    boolean isAssemblyCandidate = c2r == null ? true : false;
   
    try {
     
      List<List<SAMRecord>> readsList = new ArrayList<List<SAMRecord>>();

      for (String input : inputFiles) {
        readIds = new HashSet<String>();
        List<SAMRecord> reads = new ArrayList<SAMRecord>();
        readsList.add(reads);
       
        for (Feature region : regions) {
          SAMFileReader reader = new SAMFileReader(new File(input));
          reader.setValidationStringency(ValidationStringency.SILENT);
         
          Iterator<SAMRecord> iter;
          if (region != null) {
            iter = reader.queryOverlapping(region.getSeqname(), (int) region.getStart(), (int) region.getEnd());
          } else {
            iter = reader.iterator();
          }
         
          int candidateReadCount = 0;
         
          while (iter.hasNext() && !isMaxReadsForRegionExceeded) {
           
            SAMRecord read = iter.next();
            readCount++;
                       
            // Don't allow same read to be counted twice.
            if ( (!realigner.isFiltered(read)) &&
               (!read.getDuplicateReadFlag()) &&
               (!read.getReadFailsVendorQualityCheckFlag()) &&
               (read.getMappingQuality() >= realigner.getMinMappingQuality() || read.getReadUnmappedFlag()) &&
               //(Sam2Fastq.isPrimary(read)) &&
//               (!isHardClipped(read)) &&
               ((!checkForDupes) || (!readIds.contains(getIdentifier(read))))) {
             
              if (read.getReadString().length() > readLength) {
                reader.close();
                throw new IllegalArgumentException("Maximum read length of: " + readLength +
                    " exceeded for: " + read.getSAMString());
              }
 
              Integer numBestHits = (Integer) read.getIntegerAttribute("X0");
              boolean hasAmbiguousInitialAlignment = numBestHits != null && numBestHits > 1;
             
              if (!hasAmbiguousInitialAlignment) {
                if (!checkForDupes) {
                  readIds.add(getIdentifier(read));
                }
               
                if (!isAssemblyCandidate && isAssemblyTriggerCandidate(read, c2r)) {
                  candidateReadCount++;
                }
               
                reads.add(read);
               
                if (shouldSearchForSv && isSvCandidate(read, region)) {
                  svCandidates.add(new Position(read.getMateReferenceName(), read.getMateAlignmentStart(), input));
                }
               
                if (reads.size() > maxReadsPerSample) {
                  isMaxReadsForRegionExceeded = true;
                  break;
                }
              }
            }
          }
         
          reader.close();
         
          if (candidateReadCount > minCandidateCount(reads.size(), region)) {
            isAssemblyCandidate = true;
          }
        }
       
        if (reads.size() < minReadCount) {
          minReadCount = reads.size();
        }
      }
     
      if (isMaxReadsForRegionExceeded) {
        isAssemblyCandidate = false;
        shouldSearchForSv = false;
        System.out.println("Max reads exceeded for region: " + prefix);
      }
     
      readIds = null;
     
      StringBuffer readBuffer = new StringBuffer();
     
      if (isAssemblyCandidate) {
       
        int downsampleTarget = desiredNumberOfReads(regions);
       
        for (List<SAMRecord> reads : readsList) {
          // Default to always keep
          double keepProbability = 1.1;
         
          if (reads.size() > downsampleTarget) {
            keepProbability = (double) downsampleTarget / (double) reads.size();
          }
         
          Random random = new Random(1);
         
          for (SAMRecord read : reads) {
            if (random.nextDouble() < keepProbability) {
              readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0");
             
              if (read.getReadString().length() == readLength) {
                readBuffer.append(read.getReadString());
                readBuffer.append(read.getBaseQualityString());
              } else {
                StringBuffer basePadding = new StringBuffer();
                StringBuffer qualPadding = new StringBuffer();
               
                for (int i=0; i<readLength-read.getReadString().length(); i++) {
                  basePadding.append('N');
                  qualPadding.append('!');
                }
               
                readBuffer.append(read.getReadString() + basePadding.toString());
                readBuffer.append(read.getBaseQualityString() + qualPadding.toString());             
              }
            }
          }
         
          // Make this set of reads eligible for GC
          reads.clear();
        }
      }
     
      readsList.clear();
     
      long end1 = System.currentTimeMillis();
     
      System.out.println("Elapsed msecs collection data to assemble" + (end1-start));
     
      if (isAssemblyCandidate) {
        for (int kmer : kmers) {
       
          String outputFile = output + "_k" + kmer;
         
          contigs = assemble(
              readBuffer.toString(),
              outputFile,
              prefix,
              truncateOnRepeat ? 1 : 0,
              maxContigs,
              maxPathsFromRoot,
              readLength,
              kmer,
              minKmerFrequency,
              minBaseQuality);
         
          if (!contigs.equals("<REPEAT>")) {
            break;
          } else {
            if (kmer >= readLength/2 || kmer >= CYCLE_KMER_LENGTH_THRESHOLD) {
              isCycleExceedingThresholdDetected = true;
            }
          }
        }
      } else {
        System.out.println("Skipping assembly for: " + prefix);
      }

    } catch (Exception e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    }
   
    if (this.shouldSearchForSv) {
     
      Collections.sort(this.svCandidates);
      Position last = null;
      String currentFeatureChr = null;
      int currentFeatureStart = -1;
      int currentFeatureStop = -1;
      int currentFeatureCount = 0;
     
      // TODO: Calc this dynamically
      int windowSize = 500;
     
      for (Position pos : this.svCandidates) {
        if ((last != null) && pos.getChromosome().equals(last.getChromosome()) &&
           Math.abs(pos.getPosition()-last.getPosition()) < windowSize) {
         
          if (currentFeatureChr == null) {
            currentFeatureChr = pos.getChromosome();
            currentFeatureStart = last.getPosition();
            currentFeatureStop = pos.getPosition() + readLength;
            currentFeatureCount = 1;
          } else {
            currentFeatureStop = pos.getPosition() + readLength;
            currentFeatureCount++;
          }
        } else {
          if (currentFeatureChr != null) {
            if (currentFeatureCount > (minReadCount/MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) {
              Feature region = new Feature(currentFeatureChr, currentFeatureStart-readLength, currentFeatureStop+readLength);
              BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount);
              this.svCandidateRegions.add(candidate);
            }
            currentFeatureChr = null;
            currentFeatureStart = -1;
            currentFeatureStop = -1;
            currentFeatureCount = 0;
          } else {
            currentFeatureChr = pos.getChromosome();
            currentFeatureStart = pos.getPosition();
            currentFeatureStop = pos.getPosition() + readLength;
            currentFeatureCount = 1;
          }
        }
        last = pos;
      }
    }
   
    long end = System.currentTimeMillis();
   
    System.out.println("Elapsed_msecs_in_NativeAssembler\tRegion:\t" + regions.get(0).getDescriptor() + "\tLength:\t" + regions.get(0).getLength() + "\tReadCount:\t" + readCount + "\tElapsed\t" + (end-start) + "\tAssembled\t" + isAssemblyCandidate);
   
    return contigs;
  }
 
  String nativeAssemble(String input, String output, String prefix, int truncateOnRepeat, int maxContigs, int maxPathsFromRoot, int readLength, int[] kmers, int minKmerFreq, int minBaseQuality) {
    String result = "";
    for (int kmer : kmers) {
      result = assemble(input, output, prefix, truncateOnRepeat, maxContigs, maxPathsFromRoot, readLength, kmer, minKmerFreq, minBaseQuality);
      if (!result.equals("<REPEAT>")) {
        break;
      }
    }
    return result;
  }
 
  private boolean isSvCandidate(SAMRecord read, Feature region) {
    boolean isCandidate = false;
    if (!read.getProperPairFlag() && !read.getMateUnmappedFlag()) {
      if (!read.getReferenceName().equals(read.getMateReferenceName())) {
        isCandidate = true;
      } else if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > CombineChimera3.MAX_GAP_LENGTH &&
          !region.overlaps(read.getMateReferenceName(), read.getMateAlignmentStart()-(int)region.getLength(), read.getMateAlignmentStart()+read.getReadLength()+(int)region.getLength())) {
        isCandidate = true;
      }
    }
    return isCandidate;
  }
 
  public boolean shouldSearchForSv() {
    return shouldSearchForSv;
  }

  public void setShouldSearchForSv(boolean shouldSearchForSv) {
    this.shouldSearchForSv = shouldSearchForSv;
  }
 
  public List<BreakpointCandidate> getSvCandidateRegions() {
    return this.svCandidateRegions;
  }
 
  private boolean hasLowQualityBase(SAMRecord read) {
    //TODO: Don't hardcode phred33
    for (int i=0; i<read.getBaseQualityString().length(); i++) {
      if ((read.getBaseQualityString().charAt(i) - '!') < 20) {
        return true;
      }
    }
   
    return false;
  }

  public boolean isTruncateOnRepeat() {
    return truncateOnRepeat;
  }

  public void setTruncateOutputOnRepeat(boolean truncateOnRepeat) {
    this.truncateOnRepeat = truncateOnRepeat;
  }

  public int getMaxContigs() {
    return maxContigs;
  }

  public void setMaxContigs(int maxContigs) {
    this.maxContigs = maxContigs;
  }

  public int getMaxPathsFromRoot() {
    return maxPathsFromRoot;
  }

  public void setMaxPathsFromRoot(int maxPathsFromRoot) {
    this.maxPathsFromRoot = maxPathsFromRoot;
  }
 
  public void setReadLength(int readLength) {
    this.readLength = readLength;
  }
 
  public void setKmer(int[] kmers) {
    this.kmers = kmers;
  }
 
  public void setMinKmerFrequency(int frequency) {
    this.minKmerFrequency = frequency;
  }
 
  public void setMinBaseQuality(int minBaseQuality) {
    this.minBaseQuality = minBaseQuality;
  }
 
  public void setMaxAverageDepth(int maxAverageDepth) {
    this.maxAverageDepth = maxAverageDepth;
  }
 
  public void setMinReadCandidateFraction(double minReadCandidateFraction) {
    this.minReadCandidateFraction = minReadCandidateFraction;
  }
 
  public void setAverageDepthCeiling(int averageDepthCeiling) {
    this.averageDepthCeiling = averageDepthCeiling;
  }
 
  public boolean isCycleExceedingThresholdDetected() {
    return isCycleExceedingThresholdDetected;
  }
 
  static class Position implements Comparable<Position> {
    private String chromosome;
    private int position;
    private String inputFile;
   
    Position(String chromosome, int position, String inputFile) {
      this.chromosome = chromosome;
      this.position = position;
      this.inputFile = inputFile;
    }
   
    String getChromosome() {
      return chromosome;
    }
   
    int getPosition() {
      return position;
    }
   
    public String toString() {
      return chromosome + ":" + position;
    }

    @Override
    public int compareTo(Position that) {
      int compare = this.chromosome.compareTo(that.chromosome);
      if (compare == 0) {
        compare = this.position - that.position;
      }
      return compare;
    }
  }
 
  public static void main(String[] args) throws Exception {
   
    NativeLibraryLoader l = new NativeLibraryLoader();
    l.load("/home/lmose/code/abra/target");
   
    NativeAssembler assem = new NativeAssembler();
    assem.setTruncateOutputOnRepeat(true);
    assem.setMaxContigs(5000);
    assem.setMaxPathsFromRoot(5000);
    assem.setKmer(new int[] { 43 });
    assem.setReadLength(100);
    assem.setMinKmerFrequency(2);
    assem.setMaxAverageDepth(400);
    assem.setShouldSearchForSv(true);
   
//    String bam1 = args[0];
    String bam1 = "/home/lmose/dev/abra/sv/test.bam";
    List<String> inputFiles = new ArrayList<String>();
    inputFiles.add(bam1);
   
    //String output = args[2];
    String output = "/home/lmose/dev/abra/sv/output.txt";
    //chr18:60,793,358-60,793,758
    Feature region = new Feature("chr18", 60793358, 60793758);
    String prefix = "pre";
    boolean checkForDupes = true;
    ReAligner realigner = new ReAligner();
    CompareToReference2 c2r = new CompareToReference2();
    //c2r.init(args[3]);
    c2r.init("/home/lmose/reference/chr18/chr18.fa");
   
    List<Feature> regions = new ArrayList<Feature>();
    regions.add(region);
    String contigs = assem.assembleContigs(inputFiles, output, "asm_temp", regions, prefix, checkForDupes, realigner, c2r);
    System.out.println(contigs);
   
    System.out.println("-------------------------");
   
    List<BreakpointCandidate> svCandidates = assem.getSvCandidateRegions();
    for (BreakpointCandidate svCandidate : svCandidates) {
      System.out.println("SV: " + region.getDescriptor() + "-->" + svCandidate.getRegion().getDescriptor());
    }
   
//    assem.assembleContigs(args[0], args[1], "contig");
   
//    for (int i=0; i<10; i++) {
//      run(args[0], args[1] + "_" + i);
//    }
   
//    run(args[0], args[1]);
   
//    assem.assembleContigs("/home/lmose/code/abra/src/main/c/1810_reads.txt",
//        "/home/lmose/code/abra/src/main/c/1810.fa", "bar");
  }

 
}
TOP

Related Classes of abra.NativeAssembler$Position

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.