Package org.broadinstitute.gatk.utils.sam

Source Code of org.broadinstitute.gatk.utils.sam.ReadUtils

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.sam;

import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;

import java.io.File;
import java.util.*;

/**
* A miscellaneous collection of utilities for working with SAM files, headers, etc.
* Static methods only, please.
*
* @author mhanna
* @version 0.1
*/
public class ReadUtils {
    private final static Logger logger = Logger.getLogger(ReadUtils.class);
   
    private static final String OFFSET_OUT_OF_BOUNDS_EXCEPTION = "Offset cannot be greater than read length %d : %d";
    private static final String OFFSET_NOT_ZERO_EXCEPTION = "We ran past the end of the read and never found the offset, something went wrong!";
   
    private ReadUtils() {
    }

    private static final int DEFAULT_ADAPTOR_SIZE = 100;
    public static final int CLIPPING_GOAL_NOT_REACHED = -1;

    /**
     * A marker to tell which end of the read has been clipped
     */
    public enum ClippingTail {
        LEFT_TAIL,
        RIGHT_TAIL
    }

    /**
     * A HashMap of the SAM spec read flag names
     *
     * Note: This is not being used right now, but can be useful in the future
     */
    private static final Map<Integer, String> readFlagNames = new HashMap<Integer, String>();

    static {
        readFlagNames.put(0x1, "Paired");
        readFlagNames.put(0x2, "Proper");
        readFlagNames.put(0x4, "Unmapped");
        readFlagNames.put(0x8, "MateUnmapped");
        readFlagNames.put(0x10, "Forward");
        //readFlagNames.put(0x20, "MateForward");
        readFlagNames.put(0x40, "FirstOfPair");
        readFlagNames.put(0x80, "SecondOfPair");
        readFlagNames.put(0x100, "NotPrimary");
        readFlagNames.put(0x200, "NON-PF");
        readFlagNames.put(0x400, "Duplicate");
    }

    /**
     * This enum represents all the different ways in which a read can overlap an interval.
     *
     * NO_OVERLAP_CONTIG:
     * read and interval are in different contigs.
     *
     * NO_OVERLAP_LEFT:
     * the read does not overlap the interval.
     *
     *                        |----------------| (interval)
     *   <---------------->                      (read)
     *
     * NO_OVERLAP_RIGHT:
     * the read does not overlap the interval.
     *
     *   |----------------|                      (interval)
     *                        <----------------> (read)
     *
     * OVERLAP_LEFT:
     * the read starts before the beginning of the interval but ends inside of it
     *
     *          |----------------| (interval)
     *   <---------------->        (read)
     *
     * OVERLAP_RIGHT:
     * the read starts inside the interval but ends outside of it
     *
     *   |----------------|     (interval)
     *       <----------------> (read)
     *
     * OVERLAP_LEFT_AND_RIGHT:
     * the read starts before the interval and ends after the interval
     *
     *      |-----------|     (interval)
     *  <-------------------> (read)
     *
     * OVERLAP_CONTAINED:
     * the read starts and ends inside the interval
     *
     *  |----------------|     (interval)
     *     <-------->          (read)
     */
    public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED}

    /**
     * Creates a SAMFileWriter using all of the features currently set in the engine (command line arguments, ReadTransformers, etc)
     * @param file the filename to write to
     * @param engine the engine
     * @return a SAMFileWriter with the correct options set
     */
    public static SAMFileWriter createSAMFileWriter(final String file, final GenomeAnalysisEngine engine) {
        final SAMFileWriterStub output = new SAMFileWriterStub(engine, new File(file));
        output.processArguments(engine.getArguments());
        return output;
    }

    /**
     *  As {@link #createSAMFileWriter(String, org.broadinstitute.gatk.engine.GenomeAnalysisEngine)}, but also sets the header
     */
    public static SAMFileWriter createSAMFileWriter(final String file, final GenomeAnalysisEngine engine, final SAMFileHeader header) {
        final SAMFileWriterStub output = (SAMFileWriterStub) createSAMFileWriter(file, engine);
        output.writeHeader(header);
        return output;
    }

    /**
     * is this base inside the adaptor of the read?
     *
     * There are two cases to treat here:
     *
     * 1) Read is in the negative strand => Adaptor boundary is on the left tail
     * 2) Read is in the positive strand => Adaptor boundary is on the right tail
     *
     * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event)
     *
     * @param read the read to test
     * @param basePos base position in REFERENCE coordinates (not read coordinates)
     * @return whether or not the base is in the adaptor
     */
    public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) {
        final int adaptorBoundary = read.getAdaptorBoundary();
        if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE)
            return false;

        return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary;
    }

    /**
     * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to
     * the read boundary. If the read is in the positive strand, this is the first base after the end of the
     * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the
     * beginning of the fragment.
     *
     * There are two cases we need to treat here:
     *
     * 1) Our read is in the reverse strand :
     *
     *     <----------------------| *
     *   |--------------------->
     *
     *   in these cases, the adaptor boundary is at the mate start (minus one)
     *
     * 2) Our read is in the forward strand :
     *
     *   |---------------------->   *
     *     <----------------------|
     *
     *   in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one)
     *
     * @param read the read being tested for the adaptor boundary
     * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read.
     * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig.
     */
    public static int getAdaptorBoundary(final SAMRecord read) {
        if ( ! hasWellDefinedFragmentSize(read) ) {
            return CANNOT_COMPUTE_ADAPTOR_BOUNDARY;
        } else if ( read.getReadNegativeStrandFlag() ) {
            return read.getMateAlignmentStart() - 1;           // case 1 (see header)
        } else {
            final int insertSize = Math.abs(read.getInferredInsertSize());    // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value)
            return read.getAlignmentStart() + insertSize + 1// case 2 (see header)
        }
    }

    public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE;

    /**
     * Can the adaptor sequence of read be reliably removed from the read based on the alignment of
     * read and its mate?
     *
     * @param read the read to check
     * @return true if it can, false otherwise
     */
    public static boolean hasWellDefinedFragmentSize(final SAMRecord read) {
        if ( read.getInferredInsertSize() == 0 )
            // no adaptors in reads with mates in another chromosome or unmapped pairs
            return false;
        if ( ! read.getReadPairedFlag() )
            // only reads that are paired can be adaptor trimmed
            return false;
        if ( read.getReadUnmappedFlag() || read.getMateUnmappedFlag() )
            // only reads when both reads are mapped can be trimmed
            return false;
//        if ( ! read.getProperPairFlag() )
//            // note this flag isn't always set properly in BAMs, can will stop us from eliminating some proper pairs
//            // reads that aren't part of a proper pair (i.e., have strange alignments) can't be trimmed
//            return false;
        if ( read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() )
            // sanity check on getProperPairFlag to ensure that read1 and read2 aren't on the same strand
            return false;

        if ( read.getReadNegativeStrandFlag() ) {
            // we're on the negative strand, so our read runs right to left
            return read.getAlignmentEnd() > read.getMateAlignmentStart();
        } else {
            // we're on the positive strand, so our mate should be to our right (his start + insert size should be past our start)
            return read.getAlignmentStart() <= read.getMateAlignmentStart() + read.getInferredInsertSize();
        }
    }

    /**
     * is the read a 454 read?
     *
     * @param read the read to test
     * @return checks the read group tag PL for the default 454 tag
     */
    public static boolean is454Read(GATKSAMRecord read) {
        return NGSPlatform.fromRead(read) == NGSPlatform.LS454;
    }

    /**
     * is the read an IonTorrent read?
     *
     * @param read the read to test
     * @return checks the read group tag PL for the default ion tag
     */
    public static boolean isIonRead(GATKSAMRecord read) {
        return NGSPlatform.fromRead(read) == NGSPlatform.ION_TORRENT;
    }

    /**
     * is the read a SOLiD read?
     *
     * @param read the read to test
     * @return checks the read group tag PL for the default SOLiD tag
     */
    public static boolean isSOLiDRead(GATKSAMRecord read) {
        return NGSPlatform.fromRead(read) == NGSPlatform.SOLID;
    }

    /**
     * is the read a SLX read?
     *
     * @param read the read to test
     * @return checks the read group tag PL for the default SLX tag
     */
    public static boolean isIlluminaRead(GATKSAMRecord read) {
        return NGSPlatform.fromRead(read) == NGSPlatform.ILLUMINA;
    }

    /**
     * checks if the read has a platform tag in the readgroup equal to 'name'.
     * Assumes that 'name' is upper-cased.
     *
     * @param read the read to test
     * @param name the upper-cased platform name to test
     * @return whether or not name == PL tag in the read group of read
     */
    public static boolean isPlatformRead(GATKSAMRecord read, String name) {

        SAMReadGroupRecord readGroup = read.getReadGroup();
        if (readGroup != null) {
            Object readPlatformAttr = readGroup.getAttribute("PL");
            if (readPlatformAttr != null)
                return readPlatformAttr.toString().toUpperCase().contains(name);
        }
        return false;
    }


    /**
     * Returns the collections of reads sorted in coordinate order, according to the order defined
     * in the reads themselves
     *
     * @param reads
     * @return
     */
    public final static List<GATKSAMRecord> sortReadsByCoordinate(List<GATKSAMRecord> reads) {
        final SAMRecordComparator comparer = new SAMRecordCoordinateComparator();
        Collections.sort(reads, comparer);
        return reads;
    }

    /**
     * If a read starts in INSERTION, returns the first element length.
     *
     * Warning: If the read has Hard or Soft clips before the insertion this function will return 0.
     *
     * @param read
     * @return the length of the first insertion, or 0 if there is none (see warning).
     */
    public final static int getFirstInsertionOffset(SAMRecord read) {
        CigarElement e = read.getCigar().getCigarElement(0);
        if ( e.getOperator() == CigarOperator.I )
            return e.getLength();
        else
            return 0;
    }

    /**
     * If a read ends in INSERTION, returns the last element length.
     *
     * Warning: If the read has Hard or Soft clips after the insertion this function will return 0.
     *
     * @param read
     * @return the length of the last insertion, or 0 if there is none (see warning).
     */
    public final static int getLastInsertionOffset(SAMRecord read) {
        CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1);
        if ( e.getOperator() == CigarOperator.I )
            return e.getLength();
        else
            return 0;
    }

    /**
     * Determines what is the position of the read in relation to the interval.
     * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison.
     * @param read the read
     * @param interval the interval
     * @return the overlap type as described by ReadAndIntervalOverlap enum (see above)
     */
    public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord read, GenomeLoc interval) {

        int sStart = read.getSoftStart();
        int sStop = read.getSoftEnd();
        int uStart = read.getUnclippedStart();
        int uStop = read.getUnclippedEnd();

        if ( !read.getReferenceName().equals(interval.getContig()) )
            return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG;

        else if ( uStop < interval.getStart() )
            return ReadAndIntervalOverlap.NO_OVERLAP_LEFT;

        else if ( uStart > interval.getStop() )
            return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT;

        else if ( sStop < interval.getStart() )
            return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT;

        else if ( sStart > interval.getStop() )
            return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT;

        else if ( (sStart >= interval.getStart()) &&
                  (sStop <= interval.getStop()) )
            return ReadAndIntervalOverlap.OVERLAP_CONTAINED;

        else if ( (sStart < interval.getStart()) &&
                  (sStop > interval.getStop()) )
            return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;

        else if ( (sStart < interval.getStart()) )
            return ReadAndIntervalOverlap.OVERLAP_LEFT;

        else
            return ReadAndIntervalOverlap.OVERLAP_RIGHT;
    }

    /**
     * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of
     * two corner cases:
     *
     * 1. If clipping the right tail (end of the read) getReadCoordinateForReferenceCoordinate and fall inside
     * a deletion return the base after the deletion. If clipping the left tail (beginning of the read) it
     * doesn't matter because it already returns the previous base by default.
     *
     * 2. If clipping the left tail (beginning of the read) getReadCoordinateForReferenceCoordinate and the
     * read starts with an insertion, and you're requesting the first read based coordinate, it will skip
     * the leading insertion (because it has the same reference coordinate as the following base).
     *
     * @param read
     * @param refCoord
     * @param tail
     * @return the read coordinate corresponding to the requested reference coordinate for clipping.
     */
    @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"})
    @Ensures({"result >= 0", "result < read.getReadLength()"})
    public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) {
        return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false);
    }

    public static int getReadCoordinateForReferenceCoordinateUpToEndOfRead(GATKSAMRecord read, int refCoord, ClippingTail tail) {
        final int leftmostSafeVariantPosition = Math.max(read.getSoftStart(), refCoord);
        return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), leftmostSafeVariantPosition, tail, false);
    }

    public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) {
        Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached);
        int readCoord = result.getFirst();

        // Corner case one: clipping the right tail and falls on deletion, move to the next
        // read coordinate. It is not a problem for the left tail because the default answer
        // from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate.
        if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL)
            readCoord++;

        // clipping the left tail and first base is insertion, go to the next read coordinate
        // with the same reference coordinate. Advance to the next cigar element, or to the
        // end of the read if there is no next element.
        final CigarElement firstElementIsInsertion = readStartsWithInsertion(cigar);
        if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion != null)
            readCoord = Math.min(firstElementIsInsertion.getLength(), cigar.getReadLength() - 1);

        return readCoord;
    }

    /**
     * Returns the read coordinate corresponding to the requested reference coordinate.
     *
     * WARNING: if the requested reference coordinate happens to fall inside or just before a deletion (or skipped region) in the read, this function
     * will return the last read base before the deletion (or skipped region). This function returns a
     * Pair(int readCoord, boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion) so you can choose which readCoordinate to use when faced with
     * a deletion (or skipped region).
     *
     * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a
     * pre-processed result according to normal clipping needs. Or you can use this function and tailor the
     * behavior to your needs.
     *
     * @param read
     * @param refCoord the requested reference coordinate
     * @return the read coordinate corresponding to the requested reference coordinate. (see warning!)
     */
    @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"})
    @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"})
    //TODO since we do not have contracts any more, should we check for the requirements in the method code?
    public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) {
        return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false);
    }

    public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) {
        int readBases = 0;
        int refBases = 0;
        boolean fallsInsideDeletionOrSkippedRegion = false;
        boolean endJustBeforeDeletionOrSkippedRegion = false;
        boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = false;

        final int goal = refCoord - alignmentStart;  // The goal is to move this many reference bases
        if (goal < 0) {
            if (allowGoalNotReached) {
                return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
            } else {
                throw new ReviewedGATKException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
            }
        }
        boolean goalReached = refBases == goal;

        Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
        while (!goalReached && cigarElementIterator.hasNext()) {
            final CigarElement cigarElement = cigarElementIterator.next();
            int shift = 0;

            if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
                if (refBases + cigarElement.getLength() < goal)
                    shift = cigarElement.getLength();
                else
                    shift = goal - refBases;

                refBases += shift;
            }
            goalReached = refBases == goal;

            if (!goalReached && cigarElement.getOperator().consumesReadBases())
                readBases += cigarElement.getLength();

            if (goalReached) {
                // Is this base's reference position within this cigar element? Or did we use it all?
                final boolean endsWithinCigar = shift < cigarElement.getLength();

                // If it isn't, we need to check the next one. There should *ALWAYS* be a next one
                // since we checked if the goal coordinate is within the read length, so this is just a sanity check.
                if (!endsWithinCigar && !cigarElementIterator.hasNext()) {
                    if (allowGoalNotReached) {
                        return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
                    } else {
                        throw new ReviewedGATKException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s  and cigar: %s", alignmentStart, cigar));
                    }
                }

                CigarElement nextCigarElement = null;

                // if we end inside the current cigar element, we just have to check if it is a deletion (or skipped region)
                if (endsWithinCigar)
                    fallsInsideDeletionOrSkippedRegion = (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) ;

                // if we end outside the current cigar element, we need to check if the next element is an insertion, deletion or skipped region.
                else {
                    nextCigarElement = cigarElementIterator.next();

                    // if it's an insertion, we need to clip the whole insertion before looking at the next element
                    if (nextCigarElement.getOperator() == CigarOperator.INSERTION) {
                        readBases += nextCigarElement.getLength();
                        if (!cigarElementIterator.hasNext()) {
                            if (allowGoalNotReached) {
                                return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
                            } else {
                                throw new ReviewedGATKException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s  and cigar: %s", alignmentStart, cigar));
                            }
                        }

                        nextCigarElement = cigarElementIterator.next();
                    }

                    // if it's a deletion (or skipped region), we will pass the information on to be handled downstream.
                    endJustBeforeDeletionOrSkippedRegion = (nextCigarElement.getOperator() == CigarOperator.DELETION || nextCigarElement.getOperator() == CigarOperator.SKIPPED_REGION);
                }

                fallsInsideOrJustBeforeDeletionOrSkippedRegion = endJustBeforeDeletionOrSkippedRegion || fallsInsideDeletionOrSkippedRegion;

                // If we reached our goal outside a deletion (or skipped region), add the shift
                if (!fallsInsideOrJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases())
                    readBases += shift;

                // If we reached our goal just before a deletion (or skipped region) we need
                // to add the shift of the current cigar element but go back to it's last element to return the last
                // base before the deletion (or skipped region) (see warning in function contracts)
                else if (endJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases())
                    readBases += shift - 1;

                // If we reached our goal inside a deletion (or skipped region), or just between a deletion and a skipped region,
                // then we must backtrack to the last base before the deletion (or skipped region)
                else if (fallsInsideDeletionOrSkippedRegion ||
                        (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.N)) ||
                        (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.D)))
                    readBases--;
            }
        }

        if (!goalReached) {
            if (allowGoalNotReached) {
                return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
            } else {
                throw new ReviewedGATKException("Somehow the requested coordinate is not covered by the read. Alignment " + alignmentStart + " | " + cigar);
            }
        }

        return new Pair<Integer, Boolean>(readBases, fallsInsideOrJustBeforeDeletionOrSkippedRegion);
    }

    /**
     * Compares two SAMRecords only the basis on alignment start.  Note that
     * comparisons are performed ONLY on the basis of alignment start; any
     * two SAM records with the same alignment start will be considered equal.
     *
     * Unmapped alignments will all be considered equal.
     */

    @Requires({"read1 != null", "read2 != null"})
    public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) {
        AlignmentStartComparator comp = new AlignmentStartComparator();
        return comp.compare(read1, read2);
    }

    /**
     * Is a base inside a read?
     *
     * @param read                the read to evaluate
     * @param referenceCoordinate the reference coordinate of the base to test
     * @return true if it is inside the read, false otherwise.
     */
    public static boolean isInsideRead(final GATKSAMRecord read, final int referenceCoordinate) {
        return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd();
    }

    /**
     * Is this read all insertion?
     *
     * @param read
     * @return whether or not the only element in the cigar string is an Insertion
     */
    public static boolean readIsEntirelyInsertion(GATKSAMRecord read) {
        for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
            if (cigarElement.getOperator() != CigarOperator.INSERTION)
                return false;
        }
        return true;
    }

    /**
     * @see #readStartsWithInsertion(htsjdk.samtools.Cigar, boolean) with ignoreClipOps set to true
     */
    public static CigarElement readStartsWithInsertion(final Cigar cigarForRead) {
        return readStartsWithInsertion(cigarForRead, true);
    }

    /**
     * Checks if a read starts with an insertion.
     *
     * @param cigarForRead    the CIGAR to evaluate
     * @param ignoreSoftClipOps   should we ignore S operators when evaluating whether an I operator is at the beginning?  Note that H operators are always ignored.
     * @return the element if it's a leading insertion or null otherwise
     */
    public static CigarElement readStartsWithInsertion(final Cigar cigarForRead, final boolean ignoreSoftClipOps) {
        for ( final CigarElement cigarElement : cigarForRead.getCigarElements() ) {
            if ( cigarElement.getOperator() == CigarOperator.INSERTION )
                return cigarElement;

            else if ( cigarElement.getOperator() != CigarOperator.HARD_CLIP && ( !ignoreSoftClipOps || cigarElement.getOperator() != CigarOperator.SOFT_CLIP) )
                break;
        }
        return null;
    }

    /**
     * Returns the coverage distribution of a list of reads within the desired region.
     *
     * See getCoverageDistributionOfRead for information on how the coverage is calculated.
     *
     * @param list          the list of reads covering the region
     * @param startLocation the first reference coordinate of the region (inclusive)
     * @param stopLocation  the last reference coordinate of the region (inclusive)
     * @return an array with the coverage of each position from startLocation to stopLocation
     */
    public static int [] getCoverageDistributionOfReads(List<GATKSAMRecord> list, int startLocation, int stopLocation) {
        int [] totalCoverage = new int[stopLocation - startLocation + 1];

        for (GATKSAMRecord read : list) {
            int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation);
            totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage);
        }

        return totalCoverage;
    }

    /**
     * Returns the coverage distribution of a single read within the desired region.
     *
     * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample
     * reads for variant regions, and deletions count as variants)
     *
     * @param read          the read to get the coverage distribution of
     * @param startLocation the first reference coordinate of the region (inclusive)
     * @param stopLocation  the last reference coordinate of the region (inclusive)
     * @return an array with the coverage of each position from startLocation to stopLocation
     */
    public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) {
        int [] coverage = new int[stopLocation - startLocation + 1];
        int refLocation = read.getSoftStart();
        for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
            switch (cigarElement.getOperator()) {
                case S:
                case M:
                case EQ:
                case N:
                case X:
                case D:
                    for (int i = 0; i < cigarElement.getLength(); i++) {
                        if (refLocation >= startLocation && refLocation <= stopLocation) {
                            coverage[refLocation - startLocation]++;
                        }
                        refLocation++;
                    }
                    break;

                case P:
                case I:
                case H:
                    break;
            }

            if (refLocation > stopLocation)
                break;
        }
        return coverage;
    }

    /**
     * Makes association maps for the reads and loci coverage as described below :
     *
     *  - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage.
     *    Note: Locus is in reference coordinates.
     *    Example: Locus => {read1, read2, ..., readN}
     *
     *  - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage.
     *    Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage.
     *    Example: Read => {true, true, false, ... false}
     *
     * @param readList      the list of reads to generate the association mappings
     * @param startLocation the first reference coordinate of the region (inclusive)
     * @param stopLocation  the last reference coordinate of the region (inclusive)
     * @return the two hashmaps described above
     */
    public static Pair<HashMap<Integer, HashSet<GATKSAMRecord>> , HashMap<GATKSAMRecord, Boolean[]>> getBothReadToLociMappings (List<GATKSAMRecord> readList, int startLocation, int stopLocation) {
        int arraySize = stopLocation - startLocation + 1;

        HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2*(stopLocation - startLocation + 1), 0.5f);
        HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2*readList.size(), 0.5f);

        for (int i = startLocation; i <= stopLocation; i++)
            locusToReadMap.put(i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty lists

        for (GATKSAMRecord read : readList) {
            readToLocusMap.put(read, new Boolean[arraySize]);       // Initialize the readToLocus map with empty arrays

            int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation);

            for (int i = 0; i < readCoverage.length; i++) {
                int refLocation = i + startLocation;
                if (readCoverage[i] > 0) {
                    // Update the hash for this locus
                    HashSet<GATKSAMRecord> readSet = locusToReadMap.get(refLocation);
                    readSet.add(read);

                    // Add this locus to the read hash
                    readToLocusMap.get(read)[refLocation - startLocation] = true;
                }
                else
                    // Update the boolean array with a 'no coverage' from this read to this locus
                    readToLocusMap.get(read)[refLocation-startLocation] = false;
            }
        }
        return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>(locusToReadMap, readToLocusMap);
    }

    /**
     * Create random read qualities
     *
     * @param length the length of the read
     * @return an array with randomized base qualities between 0 and 50
     */
    public static byte[] createRandomReadQuals(int length) {
        Random random = GenomeAnalysisEngine.getRandomGenerator();
        byte[] quals = new byte[length];
        for (int i = 0; i < length; i++)
            quals[i] = (byte) random.nextInt(50);
        return quals;
    }

    /**
     * Create random read qualities
     *
     * @param length  the length of the read
     * @param allowNs whether or not to allow N's in the read
     * @return an array with randomized bases (A-N) with equal probability
     */
    public static byte[] createRandomReadBases(int length, boolean allowNs) {
        Random random = GenomeAnalysisEngine.getRandomGenerator();
        int numberOfBases = allowNs ? 5 : 4;
        byte[] bases = new byte[length];
        for (int i = 0; i < length; i++) {
            switch (random.nextInt(numberOfBases)) {
                case 0:
                    bases[i] = 'A';
                    break;
                case 1:
                    bases[i] = 'C';
                    break;
                case 2:
                    bases[i] = 'G';
                    break;
                case 3:
                    bases[i] = 'T';
                    break;
                case 4:
                    bases[i] = 'N';
                    break;
                default:
                    throw new ReviewedGATKException("Something went wrong, this is just impossible");
            }
        }
        return bases;
    }

    public static GATKSAMRecord createRandomRead(int length) {
        return createRandomRead(length, true);
    }

    public static GATKSAMRecord createRandomRead(int length, boolean allowNs) {
        byte[] quals = ReadUtils.createRandomReadQuals(length);
        byte[] bbases = ReadUtils.createRandomReadBases(length, allowNs);
        return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
    }


    public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
        String[] sequenceRecordNames = new String[sequenceDictionary.size()];
        int sequenceRecordIndex = 0;
        for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences())
            sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
        return Arrays.deepToString(sequenceRecordNames);
    }

    /**
     * Calculates the reference coordinate for a read coordinate
     *
     * @param read   the read
     * @param offset the base in the read (coordinate in the read)
     * @return the reference coordinate correspondent to this base
     */
    public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) {
        if (offset > read.getReadLength())
            throw new ReviewedGATKException(String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength()));

        long location = read.getAlignmentStart();
        Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
        while (offset > 0 && cigarElementIterator.hasNext()) {
            CigarElement cigarElement = cigarElementIterator.next();
            long move = 0;
            if (cigarElement.getOperator().consumesReferenceBases()) 
                move = (long) Math.min(cigarElement.getLength(), offset);
            location += move;
            offset -= move;
        }
        if (offset > 0 && !cigarElementIterator.hasNext())
            throw new ReviewedGATKException(OFFSET_NOT_ZERO_EXCEPTION);

        return location;
    }

    /**
     * Creates a map with each event in the read (cigar operator) and the read coordinate where it happened.
     *
     * Example:
     *  D -> 2, 34, 75
     *  I -> 55
     *  S -> 0, 101
     *  H -> 101
     *
     * @param read the read
     * @return a map with the properties described above. See example
     */
    public static Map<CigarOperator, ArrayList<Integer>> getCigarOperatorForAllBases (GATKSAMRecord read) {
        Map<CigarOperator, ArrayList<Integer>> events = new HashMap<CigarOperator, ArrayList<Integer>>();

        int position = 0;
        for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
            CigarOperator op = cigarElement.getOperator();
            if (op.consumesReadBases()) {
                ArrayList<Integer> list = events.get(op);
                if (list == null) {
                    list = new ArrayList<Integer>();
                    events.put(op, list);
                }
                for (int i = position; i < cigarElement.getLength(); i++)
                    list.add(position++);
            }
            else {
                ArrayList<Integer> list = events.get(op);
                if (list == null) {
                    list = new ArrayList<Integer>();
                    events.put(op, list);
                }
                list.add(position);
            }
        }
        return events;
    }

    /**
     * Given a read, outputs the read bases in a string format
     *
     * @param read the read
     * @return a string representation of the read bases
     */
    public static String convertReadBasesToString(GATKSAMRecord read) {
        String bases = "";
        for (byte b : read.getReadBases()) {
            bases += (char) b;
        }
        return bases.toUpperCase();
    }

    /**
     * Given a read, outputs the base qualities in a string format
     *
     * @param quals the read qualities
     * @return a string representation of the base qualities
     */
    public static String convertReadQualToString(byte[] quals) {
        String result = "";
        for (byte b : quals) {
            result += (char) (33 + b);
        }
        return result;
    }

    /**
     * Given a read, outputs the base qualities in a string format
     *
     * @param read the read
     * @return a string representation of the base qualities
     */
    public static String convertReadQualToString(GATKSAMRecord read) {
        return convertReadQualToString(read.getBaseQualities());
    }

    /**
     * Returns the reverse complement of the read bases
     *
     * @param bases the read bases
     * @return the reverse complement of the read bases
     */
    public static String getBasesReverseComplement(byte[] bases) {
        String reverse = "";
        for (int i = bases.length-1; i >=0; i--) {
            reverse += (char) BaseUtils.getComplement(bases[i]);
        }
        return reverse;
    }

    /**
     * Returns the reverse complement of the read bases
     *
     * @param read the read
     * @return the reverse complement of the read bases
     */
    public static String getBasesReverseComplement(GATKSAMRecord read) {
        return getBasesReverseComplement(read.getReadBases());
    }

    /**
     * Calculate the maximum read length from the given list of reads.
     * @param reads list of reads
     * @return      non-negative integer
     */
    @Ensures({"result >= 0"})
    public static int getMaxReadLength( final List<GATKSAMRecord> reads ) {
        if( reads == null ) { throw new IllegalArgumentException("Attempting to check a null list of reads."); }

        int maxReadLength = 0;
        for( final GATKSAMRecord read : reads ) {
            maxReadLength = Math.max(maxReadLength, read.getReadLength());
        }
        return maxReadLength;
    }
}
TOP

Related Classes of org.broadinstitute.gatk.utils.sam.ReadUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.