Package org.broadinstitute.gatk.utils.sam

Source Code of org.broadinstitute.gatk.utils.sam.AlignmentUtils$CigarPairTransform

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.sam;

import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.recalibration.EventType;
import org.broadinstitute.gatk.utils.smithwaterman.SWPairwiseAlignment;

import java.util.*;


public final class AlignmentUtils {
    private final static EnumSet<CigarOperator> ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X);
    private final static EnumSet<CigarOperator> ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S);
    public final static String HAPLOTYPE_TAG = "HC";

    // cannot be instantiated
    private AlignmentUtils() { }

    /**
     * Does cigar start or end with a deletion operation?
     *
     * @param cigar a non-null cigar to test
     * @return true if the first or last operator of cigar is a D
     */
    public static boolean startsOrEndsWithInsertionOrDeletion(final Cigar cigar) {
        if ( cigar == null ) throw new IllegalArgumentException("Cigar cannot be null");

        if ( cigar.isEmpty() )
            return false;

        final CigarOperator first = cigar.getCigarElement(0).getOperator();
        final CigarOperator last = cigar.getCigarElement(cigar.numCigarElements()-1).getOperator();
        return first == CigarOperator.D || first == CigarOperator.I || last == CigarOperator.D || last == CigarOperator.I;
    }

    /**
     * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference
     * via the alignment of haplotype (via its getCigar) method.
     *
     * @param originalRead the read we want to write aligned to the reference genome
     * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference
     * @param referenceStart the start of the reference that haplotype is aligned to.  Provides global coordinate frame.
     * @param isInformative true if the read is differentially informative for one of the haplotypes
     *
     * @throws IllegalArgumentException if {@code originalRead} is {@code null} or {@code haplotype} is {@code null} or it
     *   does not have a Cigar or the {@code referenceStart} is invalid (less than 1).
     *
     * @return a GATKSAMRecord aligned to reference. Never {@code null}.
     */
    public static GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead,
                                                       final Haplotype haplotype,
                                                       final int referenceStart,
                                                       final boolean isInformative) {
        if ( originalRead == null ) throw new IllegalArgumentException("originalRead cannot be null");
        if ( haplotype == null ) throw new IllegalArgumentException("haplotype cannot be null");
        if ( haplotype.getCigar() == null ) throw new IllegalArgumentException("Haplotype cigar not set " + haplotype);
        if ( referenceStart < 1 ) throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart);

        // compute the smith-waterman alignment of read -> haplotype
        final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), CigarUtils.NEW_SW_PARAMETERS);
        if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 )
            // sw can fail (reasons not clear) so if it happens just don't realign the read
            return originalRead;
        final Cigar swCigar = consolidateCigar(swPairwiseAlignment.getCigar());

        // since we're modifying the read we need to clone it
        final GATKSAMRecord read = (GATKSAMRecord)originalRead.clone();

        // only informative reads are given the haplotype tag to enhance visualization
        if ( isInformative )
            read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode());

        // compute here the read starts w.r.t. the reference from the SW result and the hap -> ref cigar
        final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000);
        final int readStartOnHaplotype = calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1());
        final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype;
        read.setAlignmentStart(readStartOnReference);
        read.resetSoftStartAndEnd();

        // compute the read -> ref alignment by mapping read -> hap -> ref from the
        // SW of read -> hap mapped through the given by hap -> ref
        final Cigar haplotypeToRef = trimCigarByBases(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1(), extendedHaplotypeCigar.getReadLength() - 1);
        final Cigar readToRefCigarRaw = applyCigarToCigar(swCigar, haplotypeToRef);
        final Cigar readToRefCigarClean = cleanUpCigar(readToRefCigarRaw);
        final Cigar readToRefCigar = leftAlignIndel(readToRefCigarClean, haplotype.getBases(),
                originalRead.getReadBases(), swPairwiseAlignment.getAlignmentStart2wrt1(), 0, true);

        read.setCigar(readToRefCigar);

        if ( readToRefCigar.getReadLength() != read.getReadLength() )
            throw new IllegalStateException("Cigar " + readToRefCigar + " with read length " + readToRefCigar.getReadLength()
                    + " != read length " + read.getReadLength() + " for read " + read.format() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength()
                    + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength());

        return read;
    }



    /**
     * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the
     * alignment of bases to the reference (basesToRefCigar) and the start offset of the bases on the reference
     *
     * refStart and refEnd are 0 based offsets that we want to obtain.  In the client code, if the reference
     * bases start at position X and you want Y -> Z, refStart should be Y - X and refEnd should be Z - X.
     *
     * If refStart or refEnd would start or end the new bases within a deletion, this function will return null
     *
     * @param bases
     * @param refStart
     * @param refEnd
     * @param basesStartOnRef where does the bases array start w.r.t. the reference start?  For example, bases[0] of
     *                        could be at refStart == 0 if basesStartOnRef == 0, but it could just as easily be at
     *                        10 (meaning bases doesn't fully span the reference), which would be indicated by basesStartOnRef == 10.
     *                        It's not trivial to eliminate this parameter because it's tied up with the cigar
     * @param basesToRefCigar the cigar that maps the bases to the reference genome
     * @return a byte[] containing the bases covering this interval, or null if we would start or end within a deletion
     */
    public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) {
        if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd);
        if ( basesStartOnRef < 0 ) throw new IllegalArgumentException("BasesStartOnRef must be >= 0 but got " + basesStartOnRef);
        if ( bases == null ) throw new IllegalArgumentException("Bases cannot be null");
        if ( basesToRefCigar == null ) throw new IllegalArgumentException("basesToRefCigar cannot be null");
        if ( bases.length != basesToRefCigar.getReadLength() ) throw new IllegalArgumentException("Mismatch in length between reference bases " + bases.length + " and cigar length " + basesToRefCigar);

        int refPos = basesStartOnRef;
        int basesPos = 0;
        int basesStart = -1;
        int basesStop = -1;
        boolean done = false;

        for ( int iii = 0; ! done && iii < basesToRefCigar.numCigarElements(); iii++ ) {
            final CigarElement ce = basesToRefCigar.getCigarElement(iii);
            switch ( ce.getOperator() ) {
                case I:
                    basesPos += ce.getLength();
                    break;
                case M: case X: case EQ:
                    for ( int i = 0; i < ce.getLength(); i++ ) {
                        if ( refPos == refStart )
                            basesStart = basesPos;
                        if ( refPos == refEnd ) {
                            basesStop = basesPos;
                            done = true;
                            break;
                        }
                        refPos++;
                        basesPos++;
                    }
                    break;
                case D:
                    for ( int i = 0; i < ce.getLength(); i++ ) {
                        if ( refPos == refEnd || refPos == refStart ) {
                            // if we ever reach a ref position that is either a start or an end, we fail
                            return null;
                        }
                        refPos++;
                    }
                    break;
                default:
                    throw new IllegalStateException("Unsupported operator " + ce);
            }
        }

        if ( basesStart == -1 || basesStop == -1 )
            throw new IllegalStateException("Never found start " + basesStart + " or stop " + basesStop + " given cigar " + basesToRefCigar);

        return Arrays.copyOfRange(bases, basesStart, basesStop + 1);
    }

    /**
     * Get the number of bases at which refSeq and readSeq differ, given their alignment
     *
     * @param cigar the alignment of readSeq to refSeq
     * @param refSeq the bases of the reference sequence
     * @param readSeq the bases of the read sequence
     * @return the number of bases that differ between refSeq and readSeq
     */
    public static int calcNumDifferentBases(final Cigar cigar, final byte[] refSeq, final byte[] readSeq) {
        int refIndex = 0, readIdx = 0, delta = 0;

        for (final CigarElement ce : cigar.getCigarElements()) {
            final int elementLength = ce.getLength();
            switch (ce.getOperator()) {
                case X:case EQ:case M:
                    for (int j = 0; j < elementLength; j++, refIndex++, readIdx++)
                        delta += refSeq[refIndex] != readSeq[readIdx] ? 1 : 0;
                    break;
                case I:
                    delta += elementLength;
                case S:
                    readIdx += elementLength;
                    break;
                case D:
                    delta += elementLength;
                case N:
                    refIndex += elementLength;
                    break;
                case H:
                case P:
                    break;
                default:
                    throw new ReviewedGATKException("The " + ce.getOperator() + " cigar element is not currently supported");
            }
        }

        return delta;
    }

    public static class MismatchCount {
        public int numMismatches = 0;
        public long mismatchQualities = 0;
    }

    public static long mismatchingQualities(GATKSAMRecord r, byte[] refSeq, int refIndex) {
        return getMismatchCount(r, refSeq, refIndex).mismatchQualities;
    }

    /**
     * @see #getMismatchCount(GATKSAMRecord, byte[], int, int, int) with startOnRead == 0 and nReadBases == read.getReadLength()
     */
    public static MismatchCount getMismatchCount(GATKSAMRecord r, byte[] refSeq, int refIndex) {
        return getMismatchCount(r, refSeq, refIndex, 0, r.getReadLength());
    }

    // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
    // todo -- high performance implementation.  We can do a lot better than this right now

    /**
     * Count how many bases mismatch the reference.  Indels are not considered mismatching.
     *
     * @param r                   the sam record to check against
     * @param refSeq              the byte array representing the reference sequence
     * @param refIndex            the index in the reference byte array of the read's first base (the reference index
     *                            is matching the alignment start, there may be tons of soft-clipped bases before/after
     *                            that so it's wrong to compare with getReadLength() here.).  Note that refIndex is
     *                            zero based, not 1 based
     * @param startOnRead         the index in the read's bases from which we start counting
     * @param nReadBases          the number of bases after (but including) startOnRead that we check
     * @return non-null object representing the mismatch count
     */
    @Ensures("result != null")
    public static MismatchCount getMismatchCount(GATKSAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
        if ( r == null ) throw new IllegalArgumentException("attempting to calculate the mismatch count from a read that is null");
        if ( refSeq == null ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a reference sequence that is null");
        if ( refIndex < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a reference index that is negative");
        if ( startOnRead < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a read start that is negative");
        if ( nReadBases < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count for a negative number of read bases");
        if ( refSeq.length - refIndex < (r.getAlignmentEnd() - r.getAlignmentStart()) )
            throw new IllegalArgumentException("attempting to calculate the mismatch count against a reference string that is smaller than the read");

        MismatchCount mc = new MismatchCount();

        int readIdx = 0;
        final int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count (note we are including soft-clipped bases with this math)
        final byte[] readSeq = r.getReadBases();
        final Cigar c = r.getCigar();
        final byte[] readQuals = r.getBaseQualities();
        for (final CigarElement ce : c.getCigarElements()) {

            if (readIdx > endOnRead)
                break;

            final int elementLength = ce.getLength();
            switch (ce.getOperator()) {
                case X:
                    mc.numMismatches += elementLength;
                    for (int j = 0; j < elementLength; j++)
                        mc.mismatchQualities += readQuals[readIdx+j];
                case EQ:
                    refIndex += elementLength;
                    readIdx += elementLength;
                break;
                case M:
                    for (int j = 0; j < elementLength; j++, refIndex++, readIdx++) {
                        if (refIndex >= refSeq.length)
                            continue;                      // TODO : It should never happen, we should throw exception here
                        if (readIdx < startOnRead) continue;
                        if (readIdx > endOnRead) break;
                        byte refChr = refSeq[refIndex];
                        byte readChr = readSeq[readIdx];
                        // Note: we need to count X/N's as mismatches because that's what SAM requires
                        //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
                        //     BaseUtils.simpleBaseToBaseIndex(refChr)  == -1 )
                        //    continue; // do not count Ns/Xs/etc ?
                        if (readChr != refChr) {
                            mc.numMismatches++;
                            mc.mismatchQualities += readQuals[readIdx];
                        }
                    }
                    break;
                case I:
                case S:
                    readIdx += elementLength;
                    break;
                case D:
                case N:
                    refIndex += elementLength;
                    break;
                case H:
                case P:
                    break;
                default:
                    throw new ReviewedGATKException("The " + ce.getOperator() + " cigar element is not currently supported");
            }

        }
        return mc;
    }

    /**
     * Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
     * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
     * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
     * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed.
     * Formally, this method simply returns the number of M elements in the cigar.
     *
     * @param r alignment
     * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored).
     */
    @Ensures("result >= 0")
    public static int getNumAlignmentBlocks(final SAMRecord r) {
        if ( r == null ) throw new IllegalArgumentException("read cannot be null");
        final Cigar cigar = r.getCigar();
        if (cigar == null) return 0;

        int n = 0;
        for (final CigarElement e : cigar.getCigarElements()) {
            if (ALIGNED_TO_GENOME_OPERATORS.contains(e.getOperator()))
                n++;
        }

        return n;
    }


    /**
     * Get the number of bases aligned to the genome, including soft clips
     *
     * If read is not mapped (i.e., doesn't have a cigar) returns 0
     *
     * @param r a non-null GATKSAMRecord
     * @return the number of bases aligned to the genome in R, including soft clipped bases
     */
    public static int getNumAlignedBasesCountingSoftClips(final GATKSAMRecord r) {
        int n = 0;
        final Cigar cigar = r.getCigar();
        if (cigar == null) return 0;

        for (final CigarElement e : cigar.getCigarElements())
            if (ALIGNED_TO_GENOME_PLUS_SOFTCLIPS.contains(e.getOperator()))
                n += e.getLength();

        return n;
    }

    /**
     * Count the number of bases hard clipped from read
     *
     * If read's cigar is null, return 0
     *
     * @param r a non-null read
     * @return a positive integer
     */
    @Ensures("result >= 0")
    public static int getNumHardClippedBases(final SAMRecord r) {
        if ( r == null ) throw new IllegalArgumentException("Read cannot be null");

        int n = 0;
        final Cigar cigar = r.getCigar();
        if (cigar == null) return 0;

        for (final CigarElement e : cigar.getCigarElements())
            if (e.getOperator() == CigarOperator.H)
                n += e.getLength();

        return n;
    }

    /**
     * Calculate the number of bases that are soft clipped in read with quality score greater than threshold
     *
     * Handles the case where the cigar is null (i.e., the read is unmapped), returning 0
     *
     * @param read a non-null GATKSAMRecord.
     * @param qualThreshold consider bases with quals > this value as high quality.  Must be >= 0
     * @return positive integer
     */
    @Ensures("result >= 0")
    public static int calcNumHighQualitySoftClips( final GATKSAMRecord read, final byte qualThreshold ) {
        if ( read == null ) throw new IllegalArgumentException("Read cannot be null");
        if ( qualThreshold < 0 ) throw new IllegalArgumentException("Expected qualThreshold to be a positive byte but saw " + qualThreshold);

        if ( read.getCigar() == null ) // the read is unmapped
            return 0;

        final byte[] qual = read.getBaseQualities( EventType.BASE_SUBSTITUTION );

        int numHQSoftClips = 0;
        int alignPos = 0;
        for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
            final int elementLength = ce.getLength();

            switch( ce.getOperator() ) {
                case S:
                    for( int jjj = 0; jjj < elementLength; jjj++ ) {
                        if( qual[alignPos++] > qualThreshold ) { numHQSoftClips++; }
                    }
                    break;
                case M: case I: case EQ: case X:
                    alignPos += elementLength;
                    break;
                case H: case P: case D: case N:
                    break;
                default:
                    throw new IllegalStateException("Unsupported cigar operator: " + ce.getOperator());
            }
        }

        return numHQSoftClips;
    }

    public static int calcAlignmentByteArrayOffset(final Cigar cigar, final PileupElement pileupElement, final int alignmentStart, final int refLocus) {
        return calcAlignmentByteArrayOffset( cigar, pileupElement.getOffset(), pileupElement.isDeletion(), alignmentStart, refLocus );
    }

    /**
     * Calculate the index into the read's bases of the beginning of the encompassing cigar element for a given cigar and offset
     *
     * @param cigar            the read's CIGAR -- cannot be null
     * @param offset           the offset to use for the calculation or -1 if in the middle of a deletion
     * @param isDeletion       are we in the middle of a deletion?
     * @param alignmentStart   the alignment start of the read
     * @param refLocus         the reference position of the offset
     * @return a non-negative int index
     */
    @Ensures("result >= 0")
    public static int calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isDeletion, final int alignmentStart, final int refLocus) {
        if ( cigar == null ) throw new IllegalArgumentException("attempting to find the alignment position from a CIGAR that is null");
        if ( offset < -1 ) throw new IllegalArgumentException("attempting to find the alignment position with an offset that is negative (and not -1)");
        if ( alignmentStart < 0 ) throw new IllegalArgumentException("attempting to find the alignment position from an alignment start that is negative");
        if ( refLocus < 0 ) throw new IllegalArgumentException("attempting to find the alignment position from a reference position that is negative");
        if ( offset >= cigar.getReadLength() ) throw new IllegalArgumentException("attempting to find the alignment position of an offset than is larger than the read length");

        int pileupOffset = offset;

        // Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases
        if (isDeletion) {
            pileupOffset = refLocus - alignmentStart;
            final CigarElement ce = cigar.getCigarElement(0);
            if (ce.getOperator() == CigarOperator.S) {
                pileupOffset += ce.getLength();
            }
        }

        int pos = 0;
        int alignmentPos = 0;

        for (int iii = 0; iii < cigar.numCigarElements(); iii++) {
            final CigarElement ce = cigar.getCigarElement(iii);
            final int elementLength = ce.getLength();

            switch (ce.getOperator()) {
                case I:
                case S: // TODO -- I don't think that soft clips should be treated the same as inserted bases here. Investigation needed.
                    pos += elementLength;
                    if (pos >= pileupOffset) {
                        return alignmentPos;
                    }
                    break;
                case D:
                    if (!isDeletion) {
                        alignmentPos += elementLength;
                    } else {
                        if (pos + elementLength - 1 >= pileupOffset) {
                            return alignmentPos + (pileupOffset - pos);
                        } else {
                            pos += elementLength;
                            alignmentPos += elementLength;
                        }
                    }
                    break;
                case M:
                case EQ:
                case X:
                    if (pos + elementLength - 1 >= pileupOffset) {
                        return alignmentPos + (pileupOffset - pos);
                    } else {
                        pos += elementLength;
                        alignmentPos += elementLength;
                    }
                    break;
                case H:
                case P:
                case N:
                    break;
                default:
                    throw new ReviewedGATKException("Unsupported cigar operator: " + ce.getOperator());
            }
        }

        return alignmentPos;
    }

    /**
     * Generate an array of bases for just those that are aligned to the reference (i.e. no clips or insertions)
     *
     * @param cigar            the read's CIGAR -- cannot be null
     * @param read             the read's base array
     * @return a non-null array of bases (bytes)
     */
    @Ensures("result != null")
    public static byte[] readToAlignmentByteArray(final Cigar cigar, final byte[] read) {
        if ( cigar == null ) throw new IllegalArgumentException("attempting to generate an alignment from a CIGAR that is null");
        if ( read == null ) throw new IllegalArgumentException("attempting to generate an alignment from a read sequence that is null");

        final int alignmentLength = cigar.getReferenceLength();
        final byte[] alignment = new byte[alignmentLength];
        int alignPos = 0;
        int readPos = 0;
        for (int iii = 0; iii < cigar.numCigarElements(); iii++) {

            final CigarElement ce = cigar.getCigarElement(iii);
            final int elementLength = ce.getLength();

            switch (ce.getOperator()) {
                case I:
                    if (alignPos > 0) {
                        final int prevPos = alignPos - 1;
                        if (alignment[prevPos] == BaseUtils.Base.A.base) {
                            alignment[prevPos] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE;
                        } else if (alignment[prevPos] == BaseUtils.Base.C.base) {
                            alignment[prevPos] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE;
                        } else if (alignment[prevPos] == BaseUtils.Base.T.base) {
                            alignment[prevPos] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE;
                        } else if (alignment[prevPos] == BaseUtils.Base.G.base) {
                            alignment[prevPos] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE;
                        }
                    }
                case S:
                    readPos += elementLength;
                    break;
                case D:
                case N:
                    for (int jjj = 0; jjj < elementLength; jjj++) {
                        alignment[alignPos++] = PileupElement.DELETION_BASE;
                    }
                    break;
                case M:
                case EQ:
                case X:
                    for (int jjj = 0; jjj < elementLength; jjj++) {
                        alignment[alignPos++] = read[readPos++];
                    }
                    break;
                case H:
                case P:
                    break;
                default:
                    throw new ReviewedGATKException("Unsupported cigar operator: " + ce.getOperator());
            }
        }
        return alignment;
    }

    /**
     * Returns true if the read does not belong to a contig, i.e. it's location is GenomeLoc.UNMAPPED.
     * NOTE: A read can have a mapped GenomeLoc and still have an unmapped flag!
     *
     * @param r record
     * @return true if read is unmapped to a genome loc
     */
    public static boolean isReadGenomeLocUnmapped(final SAMRecord r) {
        return SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(r.getReferenceName());
    }

    /**
     * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
     * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
     * alignment reference index/start.
     *
     * Our life would be so much easier if all sam files followed the specs. In reality,
     * sam files (including those generated by maq or bwa) miss headers altogether. When
     * reading such a SAM file, reference name is set, but since there is no sequence dictionary,
     * null is always returned for referenceIndex. Let's be paranoid here, and make sure that
     * we do not call the read "unmapped" when it has only reference name set with ref. index missing
     * or vice versa.
     *
     * @param r a non-null record
     * @return true if read is unmapped
     */
    public static boolean isReadUnmapped(final SAMRecord r) {
        if ( r == null )
            throw new IllegalArgumentException("Read cannot be null");

        return r.getReadUnmappedFlag() ||
               !((r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ||
                  r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) &&
                 r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START);

    }

    /**
     * Need a well-formed, consolidated Cigar string so that the left aligning code works properly.
     * For example, 1M1M1M1D2M1M --> 3M1D3M
     * If the given cigar is empty then the returned cigar will also be empty
     *
     * Note that this routine collapses cigar elements of size 0, so 2M0M => 2M
     *
     * @param c the cigar to consolidate
     * @return  a non-null cigar with consecutive matching operators merged into single operators.
     */
    @Ensures({"result != null"})
    public static Cigar consolidateCigar( final Cigar c ) {
        if ( c == null ) { throw new IllegalArgumentException("Cigar cannot be null"); }

        // fast check to determine if there's anything worth doing before we create new Cigar and actually do some work
        if ( ! needsConsolidation(c) )
            return c;

        final Cigar returnCigar = new Cigar();
        int sumLength = 0;
        CigarElement lastElement = null;

        for( final CigarElement cur : c.getCigarElements() ) {
            if ( cur.getLength() == 0 )
                continue; // don't add elements of 0 length

            if ( lastElement != null && lastElement.getOperator() != cur.getOperator() ) {
                returnCigar.add(new CigarElement(sumLength, lastElement.getOperator()));
                sumLength = 0;
            }

            sumLength += cur.getLength();
            lastElement = cur;
        }

        if ( sumLength > 0 ) {
            returnCigar.add(new CigarElement(sumLength, lastElement.getOperator()));
        }

        return returnCigar;
    }

    /**
     * Does the cigar C need to be consolidated?
     *
     * @param c a non-null cigar
     * @return true if so
     */
    private static boolean needsConsolidation(final Cigar c) {
        if ( c.numCigarElements() <= 1 )
            return false; // fast path for empty or single cigar

        CigarOperator lastOp = null;
        for( final CigarElement cur : c.getCigarElements() ) {
            if ( cur.getLength() == 0 || lastOp == cur.getOperator() )
                return true;
            lastOp = cur.getOperator();
        }

        return false;
    }

    /**
     * Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>
     * starting at 0-based position <code>refIndex</code> on the <code>refSeq</code> and specified by its <code>cigar</code>.
     * The last argument <code>readIndex</code> specifies 0-based position on the read where the alignment described by the
     * <code>cigar</code> starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
     * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
     * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
     * always the positions where the cigar starts on the ref and on the read, respectively.
     * <p/>
     * If the alignment has one or more indels, this method attempts to move them left across a stretch of repetitive bases.
     * For instance, if the original cigar specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output
     * cigar will always mark the leftmost AT as deleted. If there is no indel in the original cigar or if the indel position
     * is determined unambiguously (i.e. inserted/deleted sequence is not repeated), the original cigar is returned.
     *
     * Note that currently we do not actually support the case where there is more than one indel in the alignment.  We will throw
     * an exception if there is -- unless the
     *
     * @param cigar     structure of the original alignment
     * @param refSeq    reference sequence the read is aligned to
     * @param readSeq   read sequence
     * @param refIndex  0-based alignment start position on ref
     * @param readIndex 0-based alignment start position on read
     * @param doNotThrowExceptionForMultipleIndels  if true we will not throw an exception if we encounter multiple indels in the alignment will instead will return the original cigar
     * @return a non-null cigar, in which the indels are guaranteed to be placed at the leftmost possible position across a repeat (if any)
     */
    @Ensures("result != null")
    public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex, final boolean doNotThrowExceptionForMultipleIndels) {
        ensureLeftAlignmentHasGoodArguments(cigar, refSeq, readSeq, refIndex, readIndex);

        final int numIndels = countIndelElements(cigar);
        if ( numIndels == 0 )
            return cigar;
        if ( numIndels == 1 )
            return leftAlignSingleIndel(cigar, refSeq, readSeq, refIndex, readIndex, true);

        // if we got here then there is more than 1 indel in the alignment
        if ( doNotThrowExceptionForMultipleIndels )
            return cigar;

        throw new UnsupportedOperationException("attempting to left align a CIGAR that has more than 1 indel in its alignment but this functionality has not been implemented yet");
    }

    private static void ensureLeftAlignmentHasGoodArguments(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
        if ( cigar == null ) throw new IllegalArgumentException("attempting to left align a CIGAR that is null");
        if ( refSeq == null ) throw new IllegalArgumentException("attempting to left align a reference sequence that is null");
        if ( readSeq == null ) throw new IllegalArgumentException("attempting to left align a read sequence that is null");
        if ( refIndex < 0 ) throw new IllegalArgumentException("attempting to left align with a reference index less than 0");
        if ( readIndex < 0 ) throw new IllegalArgumentException("attempting to left align with a read index less than 0");
    }

    /**
     * Counts the number of I/D operators
     *
     * @param cigar   cigar to check -- cannot be null
     * @return  non-negative count of indel operators
     */
    @Requires("cigar != null")
    @Ensures("result >= 0")
    private static int countIndelElements(final Cigar cigar) {
        int indelCount = 0;
        for ( CigarElement ce : cigar.getCigarElements() ) {
            if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I )
                indelCount++;
        }
        return indelCount;
    }

    /**
     * See the documentation for AlignmentUtils.leftAlignIndel() for more details.
     *
     * This flavor of the left alignment works if and only if the alignment has one - and only one - indel.
     * An exception is thrown if there are no indels or more than 1 indel in the alignment.
     *
     * @param cigar     structure of the original alignment -- cannot be null
     * @param refSeq    reference sequence the read is aligned to
     * @param readSeq   read sequence
     * @param refIndex  0-based alignment start position on ref
     * @param readIndex 0-based alignment start position on read
     * @param cleanupCigar if true, we'll cleanup the resulting cigar element, removing 0 length elements and deletions from the first cigar position
     * @return a non-null cigar, in which the single indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
     */
    @Ensures("result != null")
    public static Cigar leftAlignSingleIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex, final boolean cleanupCigar) {
        ensureLeftAlignmentHasGoodArguments(cigar, refSeq, readSeq, refIndex, readIndex);

        int indexOfIndel = -1;
        for (int i = 0; i < cigar.numCigarElements(); i++) {
            CigarElement ce = cigar.getCigarElement(i);
            if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) {
                // if there is more than 1 indel, exception out
                if (indexOfIndel != -1)
                    throw new IllegalArgumentException("attempting to left align a CIGAR that has more than 1 indel in its alignment");
                indexOfIndel = i;
            }
        }

        // if there is no indel, exception out
        if ( indexOfIndel == -1 )
            throw new IllegalArgumentException("attempting to left align a CIGAR that has no indels in its alignment");
        // if the alignment starts with an insertion (so that there is no place on the read to move that insertion further left), we are done
        if ( indexOfIndel == 0 )
            return cigar;

        final int indelLength = cigar.getCigarElement(indexOfIndel).getLength();

        byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);
        if (altString == null)
            return cigar;

        Cigar newCigar = cigar;
        for (int i = 0; i < indelLength; i++) {
            newCigar = moveCigarLeft(newCigar, indexOfIndel);
            byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);

            // check to make sure we haven't run off the end of the read
            boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar);

            if (Arrays.equals(altString, newAltString)) {
                cigar = newCigar;
                i = -1;
                if (reachedEndOfRead)
                    cigar = cleanupCigar ? cleanUpCigar(cigar) : cigar;
            }

            if (reachedEndOfRead)
                break;
        }

        return cigar;
    }

    /**
     * Does one of the elements in cigar have a 0 length?
     *
     * @param c a non-null cigar
     * @return true if any element has 0 size
     */
    @Requires("c != null")
    protected static boolean cigarHasZeroSizeElement(final Cigar c) {
        for (final CigarElement ce : c.getCigarElements()) {
            if (ce.getLength() == 0)
                return true;
        }
        return false;
    }

    /**
     * Clean up the incoming cigar
     *
     * Removes elements with zero size
     * Clips away beginning deletion operators
     *
     * @param c the cigar string we want to clean up
     * @return a newly allocated, cleaned up Cigar
     */
    @Requires("c != null")
    @Ensures("result != null")
    public static Cigar cleanUpCigar(final Cigar c) {
        final List<CigarElement> elements = new ArrayList<CigarElement>(c.numCigarElements() - 1);

        for (final CigarElement ce : c.getCigarElements()) {
            if (ce.getLength() != 0 && (! elements.isEmpty() || ce.getOperator() != CigarOperator.D)) {
                elements.add(ce);
            }
        }

        return new Cigar(elements);
    }

    /**
     * Removing a trailing deletion from the incoming cigar if present
     *
     * @param c the cigar we want to update
     * @return a non-null Cigar
     */
    @Requires("c != null")
    @Ensures("result != null")
    public static Cigar removeTrailingDeletions(final Cigar c) {

        final List<CigarElement> elements = c.getCigarElements();
        if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.D )
            return c;

        return new Cigar(elements.subList(0, elements.size() - 1));
    }

    /**
     * Move the indel in a given cigar string one base to the left
     *
     * @param cigar          original cigar
     * @param indexOfIndel   the index of the indel cigar element
     * @return non-null cigar with indel moved one base to the left
     */
    @Requires("cigar != null && indexOfIndel >= 0 && indexOfIndel < cigar.numCigarElements()")
    @Ensures("result != null")
    private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) {
        // get the first few elements
        ArrayList<CigarElement> elements = new ArrayList<CigarElement>(cigar.numCigarElements());
        for (int i = 0; i < indexOfIndel - 1; i++)
            elements.add(cigar.getCigarElement(i));

        // get the indel element and move it left one base
        CigarElement ce = cigar.getCigarElement(indexOfIndel - 1);
        elements.add(new CigarElement(Math.max(ce.getLength() - 1, 0), ce.getOperator()));
        elements.add(cigar.getCigarElement(indexOfIndel));
        if (indexOfIndel + 1 < cigar.numCigarElements()) {
            ce = cigar.getCigarElement(indexOfIndel + 1);
            elements.add(new CigarElement(ce.getLength() + 1, ce.getOperator()));
        } else {
            elements.add(new CigarElement(1, CigarOperator.M));
        }

        // get the last few elements
        for (int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++)
            elements.add(cigar.getCigarElement(i));
        return new Cigar(elements);
    }

    /**
     * Create the string (really a byte array) representation of an indel-containing cigar against the reference.
     *
     * @param cigar             the indel-containing cigar
     * @param indexOfIndel      the index of the indel cigar element
     * @param refSeq            the reference sequence
     * @param readSeq           the read sequence for the cigar
     * @param refIndex          the starting reference index into refSeq
     * @param readIndex         the starting read index into readSeq
     * @return non-null byte array which is the indel representation against the reference
     */
    @Requires("cigar != null && indexOfIndel >= 0 && indexOfIndel < cigar.numCigarElements() && refSeq != null && readSeq != null && refIndex >= 0 && readIndex >= 0")
    @Ensures("result != null")
    private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
        CigarElement indel = cigar.getCigarElement(indexOfIndel);
        int indelLength = indel.getLength();

        int totalRefBases = 0;
        for (int i = 0; i < indexOfIndel; i++) {
            CigarElement ce = cigar.getCigarElement(i);
            int length = ce.getLength();

            switch (ce.getOperator()) {
                case M:
                case EQ:
                case X:
                    readIndex += length;
                    refIndex += length;
                    totalRefBases += length;
                    break;
                case S:
                    readIndex += length;
                    break;
                case N:
                    refIndex += length;
                    totalRefBases += length;
                    break;
                default:
                    break;
            }
        }

        // sometimes, when there are very large known indels, we won't have enough reference sequence to cover them
        if (totalRefBases + indelLength > refSeq.length)
            indelLength -= (totalRefBases + indelLength - refSeq.length);

        // the indel-based reference string
        byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))];

        // add the bases before the indel, making sure it's not aligned off the end of the reference
        if (refIndex > alt.length || refIndex > refSeq.length)
            return null;
        System.arraycopy(refSeq, 0, alt, 0, refIndex);
        int currentPos = refIndex;

        // take care of the indel
        if (indel.getOperator() == CigarOperator.D) {
            refIndex += indelLength;
        } else {
            System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength);
            currentPos += indelLength;
        }

        // add the bases after the indel, making sure it's not aligned off the end of the reference
        if (refSeq.length - refIndex > alt.length - currentPos)
            return null;
        System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex);

        return alt;
    }


    /**
     * Trim cigar down to one that starts at start reference on the left and extends to end on the reference
     *
     * @param cigar a non-null Cigar to trim down
     * @param start Where should we start keeping bases on the reference?  The first position is 0
     * @param end Where should we stop keeping bases on the reference?  The maximum value is cigar.getReferenceLength()
     * @return a new Cigar with reference length == start - end + 1
     */
    public static Cigar trimCigarByReference(final Cigar cigar, final int start, final int end) {
        if ( start < 0 ) throw new IllegalArgumentException("Start must be >= 0 but got " + start);
        if ( end < start ) throw new IllegalArgumentException("End " + end + " is < start start " + start);
        if ( end > cigar.getReferenceLength() ) throw new IllegalArgumentException("End is beyond the cigar's reference length " + end + " for cigar " + cigar );

        final Cigar result = trimCigar(cigar, start, end, true);

        if ( result.getReferenceLength() != end - start + 1)
            throw new IllegalStateException("trimCigarByReference failure: start " + start + " end " + end + " for " + cigar + " resulted in cigar with wrong size " + result);
        return result;
    }

    /**
     * Trim cigar down to one that starts at start base in the cigar and extends to (inclusive) end base
     *
     * @param cigar a non-null Cigar to trim down
     * @param start Where should we start keeping bases in the cigar?  The first position is 0
     * @param end Where should we stop keeping bases in the cigar?  The maximum value is cigar.getReadLength()
     * @return a new Cigar containing == start - end + 1 reads
     */
    public static Cigar trimCigarByBases(final Cigar cigar, final int start, final int end) {
        if ( start < 0 ) throw new IllegalArgumentException("Start must be >= 0 but got " + start);
        if ( end < start ) throw new IllegalArgumentException("End " + end + " is < start = " + start);
        if ( end > cigar.getReadLength() ) throw new IllegalArgumentException("End is beyond the cigar's read length " + end + " for cigar " + cigar );

        final Cigar result = trimCigar(cigar, start, end, false);

        final int expectedSize = end - start + 1;
        if ( result.getReadLength() != expectedSize)
            throw new IllegalStateException("trimCigarByBases failure: start " + start + " end " + end + " for " + cigar + " resulted in cigar with wrong size " + result + " with size " + result.getReadLength() + " expected " + expectedSize + " for input cigar " + cigar);
        return result;
    }


    /**
     * Workhorse for trimCigarByBases and trimCigarByReference
     *
     * @param cigar a non-null Cigar to trim down
     * @param start Where should we start keeping bases in the cigar?  The first position is 0
     * @param end Where should we stop keeping bases in the cigar?  The maximum value is cigar.getReadLength()
     * @param byReference should start and end be intrepreted as position in the reference or the read to trim to/from?
     * @return a non-null cigar
     */
    @Requires({"cigar != null", "start >= 0", "start <= end"})
    @Ensures("result != null")
    private static Cigar trimCigar(final Cigar cigar, final int start, final int end, final boolean byReference) {
        final List<CigarElement> newElements = new LinkedList<CigarElement>();

        int pos = 0;
        for ( final CigarElement elt : cigar.getCigarElements() ) {
            if ( pos > end && (byReference || elt.getOperator() != CigarOperator.D) ) break;

            switch ( elt.getOperator() ) {
                case D:
                    if ( ! byReference ) {
                        if ( pos >= start )
                            newElements.add(elt);
                        break;
                    }
                    // otherwise fall through to the next case
                case EQ: case M: case X:
                    pos = addCigarElements(newElements, pos, start, end, elt);
                    break;
                case S: case I:
                    if ( byReference ) {
                        if ( pos >= start )
                            newElements.add(elt);
                    } else {
                        pos = addCigarElements(newElements, pos, start, end, elt);
                    }
                    break;
                default:
                    throw new IllegalStateException("Cannot handle " + elt);
            }
        }

        return AlignmentUtils.consolidateCigar(new Cigar(newElements));
    }

    /**
     * Helper function for trimCigar that adds cigar elements (of total length X) of elt.op to dest for
     * X bases that fall between start and end, where the last position of the base is pos.
     *
     * The primary use of this function is to create a new cigar element list that contains only
     * elements that occur between start and end bases in an initial cigar.
     *
     * Note that this function may return multiple cigar elements (1M1M etc) that are best consolidated
     * after the fact into a single simpler representation.
     *
     * @param dest we will append our cigar elements to this list
     * @param pos the position (0 indexed) where elt started
     * @param start only include bases that occur >= this position
     * @param end only include bases that occur <= this position
     * @param elt the element we are slicing down
     * @return the position after we've traversed all elt.length bases of elt
     */
   protected static int addCigarElements(final List<CigarElement> dest, int pos, final int start, final int end, final CigarElement elt) {
        final int length = Math.min(pos + elt.getLength() - 1, end) - Math.max(pos, start) + 1;
        if ( length > 0 )
            dest.add(new CigarElement(length, elt.getOperator()));
        return pos + elt.getLength();
    }

    /**
     * Get the offset (base 0) of the first reference aligned base in Cigar that occurs after readStartByBaseOfCigar base of the cigar
     *
     * The main purpose of this routine is to find a good start position for a read given it's cigar.  The real
     * challenge is that the starting base might be inside an insertion, in which case the read actually starts
     * at the next M/EQ/X operator.
     *
     * @param cigar a non-null cigar
     * @param readStartByBaseOfCigar finds the first base after this (0 indexed) that aligns to the reference genome (M, EQ, X)
     * @throws IllegalStateException if no such base can be found
     * @return an offset into cigar
     */
    public static int calcFirstBaseMatchingReferenceInCigar(final Cigar cigar, int readStartByBaseOfCigar) {
        if ( cigar == null ) throw new IllegalArgumentException("cigar cannot be null");
        if ( readStartByBaseOfCigar >= cigar.getReadLength() ) throw new IllegalArgumentException("readStartByBaseOfCigar " + readStartByBaseOfCigar + " must be <= readLength " + cigar.getReadLength());

        int hapOffset = 0, refOffset = 0;
        for ( final CigarElement ce : cigar.getCigarElements() ) {
            for ( int i = 0; i < ce.getLength(); i++ ) {
                switch ( ce.getOperator() ) {
                    case M:case EQ:case X:
                        if ( hapOffset >= readStartByBaseOfCigar )
                            return refOffset;
                        hapOffset++;
                        refOffset++;
                        break;
                    case I: case S:
                        hapOffset++;
                        break;
                    case D:
                        refOffset++;
                        break;
                    default:
                        throw new IllegalStateException("calcFirstBaseMatchingReferenceInCigar does not support cigar " + ce.getOperator() + " in cigar " + cigar);
                }
            }
        }

        throw new IllegalStateException("Never found appropriate matching state for cigar " + cigar + " given start of " + readStartByBaseOfCigar);
    }

    /**
     * Generate a new Cigar that maps the operations of the first cigar through those in a second
     *
     * For example, if first is 5M and the second is 2M1I2M then the result is 2M1I2M.
     * However, if first is 1M2D3M and second is 2M1I3M this results in a cigar X
     *
     * ref   : AC-GTA
     * hap   : ACxGTA  - 2M1I3M
     * read  : A--GTA  - 1M2D3M
     * result: A--GTA => 1M1D3M
     *
     * ref   : ACxG-TA
     * hap   : AC-G-TA  - 2M1D3M
     * read  : AC-GxTA  - 3M1I2M
     * result: AC-GxTA => 2M1D1M1I2M
     *
     * ref   : ACGTA
     * hap   : ACGTA  - 5M
     * read  : A-GTA  - 1M1I3M
     * result: A-GTA => 1M1I3M
     *
     * ref   : ACGTAC
     * hap   : AC---C  - 2M3D1M
     * read  : AC---C  - 3M
     * result: AG---C => 2M3D
     *
     * The constraint here is that both cigars should imply that the result have the same number of
     * reference bases (i.e.g, cigar.getReferenceLength() are equals).
     *
     * @param firstToSecond the cigar mapping hap1 -> hap2
     * @param secondToThird the cigar mapping hap2 -> hap3
     * @return A cigar mapping hap1 -> hap3
     */
    public static Cigar applyCigarToCigar(final Cigar firstToSecond, final Cigar secondToThird) {
        final boolean DEBUG = false;

        final List<CigarElement> newElements = new LinkedList<CigarElement>();
        final int nElements12 = firstToSecond.getCigarElements().size();
        final int nElements23 = secondToThird.getCigarElements().size();

        int cigar12I = 0, cigar23I = 0;
        int elt12I = 0, elt23I = 0;

        while ( cigar12I < nElements12 && cigar23I < nElements23 ) {
            final CigarElement elt12 = firstToSecond.getCigarElement(cigar12I);
            final CigarElement elt23 = secondToThird.getCigarElement(cigar23I);

            final CigarPairTransform transform = getTransformer(elt12.getOperator(), elt23.getOperator());

            if ( DEBUG )
                System.out.printf("Transform %s => %s with elt1 = %d %s @ %d elt2 = %d %s @ %d with transform %s%n",
                        firstToSecond, secondToThird, cigar12I, elt12.getOperator(), elt12I, cigar23I, elt23.getOperator(), elt23I, transform);

            if ( transform.op13 != null ) // skip no ops
                newElements.add(new CigarElement(1, transform.op13));

            elt12I += transform.advance12;
            elt23I += transform.advance23;

            // if have exhausted our current element, advance to the next one
            if ( elt12I == elt12.getLength() ) { cigar12I++; elt12I = 0; }
            if ( elt23I == elt23.getLength() ) { cigar23I++; elt23I = 0; }
        }

        return AlignmentUtils.consolidateCigar(new Cigar(newElements));
    }

    private static CigarPairTransform getTransformer(final CigarOperator op12, final CigarOperator op23) {
        for ( final CigarPairTransform transform : cigarPairTransformers) {
            if ( transform.op12.contains(op12) && transform.op23.contains(op23) )
                return transform;
        }

        throw new IllegalStateException("No transformer for operators " + op12 + " and " + op23);
    }

    /**
     * transformations that project one alignment state through another
     *
     * Think about this as a state machine, where we have:
     *
     * bases3 : xxx A zzz
     * bases2 : xxx B zzz
     * bases1 : xxx C zzz
     *
     * where A, B and C are alignment states of a three way alignment.  We want to capture
     * the transition from operation mapping 1 -> 2 and an operation mapping 2 -> 3 and its
     * associated mapping from 1 -> 3 and the advancement of the cigar states of 1->2 and 2->3.
     *
     * Imagine that A, B, and C are all equivalent (so that op12 = M and op23 = M).  This implies
     * a mapping of 1->3 of M, and in this case the next states to consider in the 3 way alignment
     * are the subsequent states in 1 and 2 (so that advance12 and advance23 are both 1).
     *
     * Obviously not all of the states and their associated transitions are so simple.  Suppose instead
     * that op12 = I, and op23 = M.  What does this look like:
     *
     * bases3 : xxx - A zzz
     * bases2 : xxx - B zzz
     * bases1 : xxx I C zzz
     *
     * It means that op13 must be an insertion (as we have an extra base in 1 thats not present in 2 and
     * so not present in 3).  We advance the cigar in 1 by 1 (as we've consumed one base in 1 for the I)
     * but we haven't yet found the base corresponding to the M of op23.  So we don't advance23.
     */
    private static class CigarPairTransform {
        private final EnumSet<CigarOperator> op12, op23;
        private final CigarOperator op13;
        private final int advance12, advance23;

        private CigarPairTransform(CigarOperator op12, CigarOperator op23, CigarOperator op13, int advance12, int advance23) {
            this.op12 = getCigarSet(op12);
            this.op23 = getCigarSet(op23);
            this.op13 = op13;
            this.advance12 = advance12;
            this.advance23 = advance23;
        }

        private static EnumSet<CigarOperator> getCigarSet(final CigarOperator masterOp) {
            switch ( masterOp ) {
                case M: return EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X);
                case I: return EnumSet.of(CigarOperator.I, CigarOperator.S);
                case D: return EnumSet.of(CigarOperator.D);
                default: throw new IllegalStateException("Unexpected state " + masterOp);
            }
        }

        @Override
        public String toString() {
            return "CigarPairTransform{" +
                    "op12=" + op12 +
                    ", op23=" + op23 +
                    ", op13=" + op13 +
                    ", advance12=" + advance12 +
                    ", advance23=" + advance23 +
                    '}';
        }
    }


    private final static List<CigarPairTransform> cigarPairTransformers = Arrays.asList(
            //
            // op12 is a match
            //
            // 3: xxx B yyy
            // ^^^^^^^^^^^^
            // 2: xxx M yyy
            // 1: xxx M yyy
            new CigarPairTransform(CigarOperator.M, CigarOperator.M, CigarOperator.M, 1, 1),
            // 3: xxx I yyy
            // ^^^^^^^^^^^^
            // 2: xxx I yyy
            // 1: xxx M yyy
            new CigarPairTransform(CigarOperator.M, CigarOperator.I, CigarOperator.I, 1, 1),
            // 3: xxx D yyy
            // ^^^^^^^^^^^^
            // 2: xxx D yyy
            // 1: xxx M yyy
            new CigarPairTransform(CigarOperator.M, CigarOperator.D, CigarOperator.D, 0, 1),

            //
            // op12 is a deletion
            //
            // 3: xxx D M yyy
            // ^^^^^^^^^^^^
            // 2: xxx M yyy
            // 1: xxx D yyy
            new CigarPairTransform(CigarOperator.D, CigarOperator.M, CigarOperator.D, 1, 1),
            // 3: xxx D1 D2 yyy
            // ^^^^^^^^^^^^
            // 2: xxx D2 yyy
            // 1: xxx D1 yyy
            new CigarPairTransform(CigarOperator.D, CigarOperator.D, CigarOperator.D, 1, 0),
            // 3: xxx X yyy => no-op, we skip emitting anything here
            // ^^^^^^^^^^^^
            // 2: xxx I yyy
            // 1: xxx D yyy
            new CigarPairTransform(CigarOperator.D, CigarOperator.I, null, 1, 1),

            //
            // op12 is a insertion
            //
            // 3: xxx I M yyy
            // ^^^^^^^^^^^^
            // 2: xxx M yyy
            // 1: xxx I yyy
            new CigarPairTransform(CigarOperator.I, CigarOperator.M, CigarOperator.I, 1, 0),
            // 3: xxx I D yyy
            // ^^^^^^^^^^^^
            // 2: xxx D yyy
            // 1: xxx I yyy
            new CigarPairTransform(CigarOperator.I, CigarOperator.D, CigarOperator.I, 1, 0),
            // 3: xxx I1 I2 yyy
            // ^^^^^^^^^^^^
            // 2: xxx I2 yyy
            // 1: xxx I1 yyy
            new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0)
            );
}
TOP

Related Classes of org.broadinstitute.gatk.utils.sam.AlignmentUtils$CigarPairTransform

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.