Package org.broadinstitute.gatk.utils.haplotype

Source Code of org.broadinstitute.gatk.utils.haplotype.EventMap$VariantContextComparator

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.haplotype;

import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;

import java.util.*;

/**
* Extract simple VariantContext events from a single haplotype
*
* User: depristo
* Date: 3/27/13
* Time: 8:35 AM
*/
public class EventMap extends TreeMap<Integer, VariantContext> {
    private final static Logger logger = Logger.getLogger(EventMap.class);
    protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3;
    private static final int MAX_EVENTS_PER_HAPLOTYPE = 3;
    private static final int MAX_INDELS_PER_HAPLOTYPE = 2;
    public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);

    private final Haplotype haplotype;
    private final byte[] ref;
    private final GenomeLoc refLoc;
    private final String sourceNameToAdd;

    public EventMap(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) {
        super();
        this.haplotype = haplotype;
        this.ref = ref;
        this.refLoc = refLoc;
        this.sourceNameToAdd = sourceNameToAdd;

        processCigarForInitialEvents();
    }

    /**
     * For testing.  Let's you set up a explicit configuration without having to process a haplotype and reference
     * @param stateForTesting
     */
    public EventMap(final Collection<VariantContext> stateForTesting) {
        haplotype = null;
        ref = null;
        refLoc = null;
        sourceNameToAdd = null;
        for ( final VariantContext vc : stateForTesting )
            addVC(vc);
    }

    protected void processCigarForInitialEvents() {
        final Cigar cigar = haplotype.getCigar();
        final byte[] alignment = haplotype.getBases();

        int refPos = haplotype.getAlignmentStartHapwrtRef();
        if( refPos < 0 ) {
            return;
        } // Protection against SW failures

        final List<VariantContext> proposedEvents = new ArrayList<>();

        int alignmentPos = 0;

        for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) {
            final CigarElement ce = cigar.getCigarElement(cigarIndex);
            final int elementLength = ce.getLength();
            switch( ce.getOperator() ) {
                case I:
                {
                    if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
                        final List<Allele> insertionAlleles = new ArrayList<Allele>();
                        final int insertionStart = refLoc.getStart() + refPos - 1;
                        final byte refByte = ref[refPos-1];
                        if( BaseUtils.isRegularBase(refByte) ) {
                            insertionAlleles.add( Allele.create(refByte, true) );
                        }
                        if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) {
                            // if the insertion isn't completely resolved in the haplotype, skip it
                            // note this used to emit SYMBOLIC_UNASSEMBLED_EVENT_ALLELE but that seems dangerous
                        } else {
                            byte[] insertionBases = new byte[]{};
                            insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]); // add the padding base
                            insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange(alignment, alignmentPos, alignmentPos + elementLength));
                            if( BaseUtils.isAllRegularBases(insertionBases) ) {
                                insertionAlleles.add( Allele.create(insertionBases, false) );
                            }
                        }
                        if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
                            proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
                        }
                    }
                    alignmentPos += elementLength;
                    break;
                }
                case S:
                {
                    alignmentPos += elementLength;
                    break;
                }
                case D:
                {
                    if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
                        final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength )// add padding base
                        final List<Allele> deletionAlleles = new ArrayList<Allele>();
                        final int deletionStart = refLoc.getStart() + refPos - 1;
                        final byte refByte = ref[refPos-1];
                        if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
                            deletionAlleles.add( Allele.create(deletionBases, true) );
                            deletionAlleles.add( Allele.create(refByte, false) );
                            proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
                        }
                    }
                    refPos += elementLength;
                    break;
                }
                case M:
                case EQ:
                case X:
                {
                    for( int iii = 0; iii < elementLength; iii++ ) {
                        final byte refByte = ref[refPos];
                        final byte altByte = alignment[alignmentPos];
                        if( refByte != altByte ) { // SNP!
                            if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) {
                                final List<Allele> snpAlleles = new ArrayList<Allele>();
                                snpAlleles.add( Allele.create( refByte, true ) );
                                snpAlleles.add( Allele.create( altByte, false ) );
                                proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
                            }
                        }
                        refPos++;
                        alignmentPos++;
                    }
                    break;
                }
                case N:
                case H:
                case P:
                default:
                    throw new ReviewedGATKException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
            }
        }

        for ( final VariantContext proposedEvent : proposedEvents )
            addVC(proposedEvent, true);
    }

    /**
     * Add VariantContext vc to this map, merging events with the same start sites if necessary
     * @param vc the variant context to add
     */
    protected void addVC(final VariantContext vc) {
        addVC(vc, true);
    }

    /**
     * Add VariantContext vc to this map
     * @param vc the variant context to add
     * @param merge should we attempt to merge it with an already existing element, or should we throw an error in that case?
     */
    protected void addVC(final VariantContext vc, final boolean merge) {
        if ( vc == null ) throw new IllegalArgumentException("vc cannot be null");

        if ( containsKey(vc.getStart()) ) {
            if ( merge ) {
                final VariantContext prev = get(vc.getStart());
                put(vc.getStart(), makeBlock(prev, vc));
            } else {
                throw new IllegalStateException("Will not merge previously bound variant contexts as merge is false at " + vc);
            }
        } else
            put(vc.getStart(), vc);
    }

    /**
     * Create a block substitution out of two variant contexts that start at the same position
     *
     * vc1 can be SNP, and vc2 can then be either a insertion or deletion.
     * If vc1 is an indel, then vc2 must be the opposite type (vc1 deletion => vc2 must be an insertion)
     *
     * @param vc1 the first variant context we want to merge
     * @param vc2 the second
     * @return a block substitution that represents the composite substitution implied by vc1 and vc2
     */
    protected VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) {
        if ( vc1.getStart() != vc2.getStart() )  throw new IllegalArgumentException("vc1 and 2 must have the same start but got " + vc1 + " and " + vc2);
        if ( ! vc1.isBiallelic() ) throw new IllegalArgumentException("vc1 must be biallelic");
        if ( ! vc1.isSNP() ) {
            if ( ! ((vc1.isSimpleDeletion() && vc2.isSimpleInsertion()) || (vc1.isSimpleInsertion() && vc2.isSimpleDeletion())))
                throw new IllegalArgumentException("Can only merge single insertion with deletion (or vice versa) but got " + vc1 + " merging with " + vc2);
        } else if ( vc2.isSNP() ) {
            throw new IllegalArgumentException("vc1 is " + vc1 + " but vc2 is a SNP, which implies there's been some terrible bug in the cigar " + vc2);
        }

        final Allele ref, alt;
        final VariantContextBuilder b = new VariantContextBuilder(vc1);
        if ( vc1.isSNP() ) {
            // we have to repair the first base, so SNP case is special cased
            if ( vc1.getReference().equals(vc2.getReference()) ) {
                // we've got an insertion, so we just update the alt to have the prev alt
                ref = vc1.getReference();
                alt = Allele.create(vc1.getAlternateAllele(0).getDisplayString() + vc2.getAlternateAllele(0).getDisplayString().substring(1), false);
            } else {
                // we're dealing with a deletion, so we patch the ref
                ref = vc2.getReference();
                alt = vc1.getAlternateAllele(0);
                b.stop(vc2.getEnd());
            }
        } else {
            final VariantContext insertion = vc1.isSimpleInsertion() ? vc1 : vc2;
            final VariantContext deletion  = vc1.isSimpleInsertion() ? vc2 : vc1;
            ref = deletion.getReference();
            alt = insertion.getAlternateAllele(0);
            b.stop(deletion.getEnd());
        }

        return b.alleles(Arrays.asList(ref, alt)).make();
    }

    // TODO -- warning this is an O(N^3) algorithm because I'm just lazy.  If it's valuable we need to reengineer it
    @Requires("getNumberOfEvents() > 0")
    protected void replaceClumpedEventsWithBlockSubstitutions() {
        if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) {
            int lastStart = -1;
            for ( boolean foundOne = true; foundOne; ) {
                foundOne = false;
                for ( final VariantContext vc : getVariantContexts() ) {
                    if ( vc.getStart() > lastStart ) {
                        lastStart = vc.getStart();
                        final List<VariantContext> neighborhood = getNeighborhood(vc, 10);
                        if ( updateToBlockSubstitutionIfBetter(neighborhood) ) {
                            foundOne = true;
                            break;
                        }
                    }
                }
            }
        }
    }

    protected boolean updateToBlockSubstitutionIfBetter(final List<VariantContext> neighbors) {
        if (neighbors.size() < MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION)
            return false;
        // TODO -- need more tests to decide if this is really so good

        final VariantContext first = neighbors.get(0);
        final int refStartOffset = first.getStart() - refLoc.getStart();
        final int refEndOffset = neighbors.get(neighbors.size() - 1).getEnd() - refLoc.getStart();

        final byte[] refBases = Arrays.copyOfRange(ref, refStartOffset, refEndOffset + 1);
        final byte[] hapBases = AlignmentUtils.getBasesCoveringRefInterval(refStartOffset, refEndOffset, haplotype.getBases(), haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar());

        final VariantContextBuilder builder = new VariantContextBuilder(first);
        builder.stop(first.getStart() + refBases.length - 1);
        builder.alleles(Arrays.asList(Allele.create(refBases, true), Allele.create(hapBases)));
        final VariantContext block = builder.make();

        // remove all merged events
        for ( final VariantContext merged : neighbors ) {
            if ( remove(merged.getStart()) == null )
                throw new IllegalArgumentException("Expected to remove variant context from the event map but remove said there wasn't any element there: " + merged);
        }

        // note must be after we remove the previous events as the treeset only allows one key per start
        logger.info("Transforming into block substitution at " + block);
        addVC(block, false);

        return true;
    }

    /**
     * Get all of the variant contexts starting at leftMost that are within maxBP of each other
     *
     * @param leftMost the left most (smallest position) variant context that will start the neighborhood
     * @param maxBPBetweenEvents the maximum distance in BP between the end of one event the start of the next
     *                           to be included the the resulting list
     * @return a list that contains at least one element (leftMost)
     */
    @Requires({"leftMost != null", "maxBPBetweenEvents >= 0"})
    @Ensures({"result != null", "! result.isEmpty()"})
    protected List<VariantContext> getNeighborhood(final VariantContext leftMost, final int maxBPBetweenEvents) {
        final List<VariantContext> neighbors = new LinkedList<VariantContext>();

        VariantContext left = leftMost;
        for ( final VariantContext vc : getVariantContexts() ) {
            if ( vc.getStart() < leftMost.getStart() )
                continue;

            if ( vc.getStart() - left.getEnd() < maxBPBetweenEvents ) {
                // this vc is within max distance to the end of the left event, so accumulate it
                neighbors.add(vc);
                left = vc;
            }
        }

        return neighbors;
    }

    /**
     * Get the starting positions of events in this event map
     * @return
     */
    public Set<Integer> getStartPositions() {
        return keySet();
    }

    /**
     * Get the variant contexts in order of start position in this event map
     * @return
     */
    public Collection<VariantContext> getVariantContexts() {
        return values();
    }

    /**
     * How many events do we have?
     * @return
     */
    public int getNumberOfEvents() {
        return size();
    }

    @Override
    public String toString() {
        final StringBuilder b = new StringBuilder("EventMap{");
        for ( final VariantContext vc : getVariantContexts() )
            b.append(String.format("%s:%d-%d %s,", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles()));
        b.append("}");
        return b.toString();
    }

    /**
     * Build event maps for each haplotype, returning the sorted set of all of the starting positions of all
     * events across all haplotypes
     *
     * @param haplotypes a list of haplotypes
     * @param ref the reference bases
     * @param refLoc the span of the reference bases
     * @param debug if true, we'll emit debugging information during this operation
     * @return a sorted set of start positions of all events among all haplotypes
     */
    public static TreeSet<Integer> buildEventMapsForHaplotypes( final List<Haplotype> haplotypes,
                                                                final byte[] ref,
                                                                final GenomeLoc refLoc,
                                                                final boolean debug) {
        // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
        final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
        int hapNumber = 0;

        if( debug ) logger.info("=== Best Haplotypes ===");
        for( final Haplotype h : haplotypes ) {
            // Walk along the alignment and turn any difference from the reference into an event
            h.setEventMap( new EventMap( h, ref, refLoc, "HC" + hapNumber++ ) );
            startPosKeySet.addAll(h.getEventMap().getStartPositions());

            if( debug ) {
                logger.info(h.toString());
                logger.info("> Cigar = " + h.getCigar());
                logger.info(">> Events = " + h.getEventMap());
            }
        }

        return startPosKeySet;
    }

    private static class VariantContextComparator implements Comparator<VariantContext> {
        @Override
        public int compare(VariantContext vc1, VariantContext vc2) {
            return vc1.getStart() - vc2.getStart();
        }
    }

    /**
     * Get all of the VariantContexts in the event maps for all haplotypes, sorted by their start position
     * @param haplotypes the set of haplotypes to grab the VCs from
     * @return a sorted set of variant contexts
     */
    public static TreeSet<VariantContext> getAllVariantContexts( final List<Haplotype> haplotypes ) {
        // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
        final TreeSet<VariantContext> vcs = new TreeSet<VariantContext>(new VariantContextComparator());

        for( final Haplotype h : haplotypes ) {
            vcs.addAll(h.getEventMap().getVariantContexts());
        }

        return vcs;
    }
}
TOP

Related Classes of org.broadinstitute.gatk.utils.haplotype.EventMap$VariantContextComparator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.