Package org.broadinstitute.gatk.utils.locusiterator

Source Code of org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.locusiterator;

import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CloseableIterator;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.ReadProperties;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.downsampling.DownsampleType;
import org.broadinstitute.gatk.engine.iterators.GATKSAMRecordIterator;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.ReadUtils;

import java.util.*;

/**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis
*
* Produces AlignmentContext objects, that contain ReadBackedPileups of PileupElements.  This
* class has its core job of converting an iterator of ordered SAMRecords into those
* RBPs.
*
* There are a few constraints on required and ensured by LIBS:
*
* -- Requires the Iterator<GATKSAMRecord> to returns reads in coordinate sorted order, consistent with the ordering
* defined by the SAM file format.  That that for performance reasons this constraint isn't actually enforced.
* The behavior of LIBS is undefined in the case where the reads are badly ordered.
* -- The reads in the ReadBackedPileup are themselves in the order of appearance of the reads from the iterator.
* That is, the pileup is ordered in a way consistent with the SAM coordinate ordering
* -- Only aligned reads with at least one on-genomic cigar operator are passed on in the pileups.  That is,
* unmapped reads or reads that are all insertions (10I) or soft clipped (10S) are not passed on.
* -- LIBS can perform per-sample downsampling of a variety of kinds.
* -- Because of downsampling there's no guarantee that:
*   -- A read that could be aligned to a position will actually occur in the pileup (downsampled away)
*   -- A read that appears in a previous pileup that could align to a future position will actually occur
*      in that pileup.  That is, a read might show up at position i but be downsampled away in the pileup at j
* -- LIBS can optionally capture all of the reads that come off the iterator, before any leveling downsampling
* occurs, if requested.  This allows users of LIBS to see both a ReadBackedPileup view of the data as well as
* a stream of unique, sorted reads
*/
public final class LocusIteratorByState extends LocusIterator {
    /** Indicates that we shouldn't do any downsampling */
    public final static LIBSDownsamplingInfo NO_DOWNSAMPLING = new LIBSDownsamplingInfo(false, -1);

    /**
     * our log, which we want to capture anything from this class
     */
    private final static Logger logger = Logger.getLogger(LocusIteratorByState.class);

    // -----------------------------------------------------------------------------------------------------------------
    //
    // member fields
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Used to create new GenomeLocs as needed
     */
    private final GenomeLocParser genomeLocParser;

    /**
     * A complete list of all samples that may come out of the reads.  Must be
     * comprehensive.
     */
    private final ArrayList<String> samples;

    /**
     * The system that maps incoming reads from the iterator to their pileup states
     */
    private final ReadStateManager readStates;

    /**
     * Should we include reads in the pileup which are aligned with a deletion operator to the reference?
     */
    private final boolean includeReadsWithDeletionAtLoci;

    /**
     * The next alignment context.  A non-null value means that a
     * context is waiting from hasNext() for sending off to the next next() call.  A null
     * value means that either hasNext() has not been called at all or that
     * the underlying iterator is exhausted
     */
    private AlignmentContext nextAlignmentContext;

    // -----------------------------------------------------------------------------------------------------------------
    //
    // constructors and other basic operations
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Create a new LocusIteratorByState
     *
     * @param samIterator the iterator of reads to process into pileups.  Reads must be ordered
     *                    according to standard coordinate-sorted BAM conventions
     * @param readInformation meta-information about how to process the reads (i.e., should we do downsampling?)
     * @param genomeLocParser used to create genome locs
     * @param samples a complete list of samples present in the read groups for the reads coming from samIterator.
     *                This is generally just the set of read group sample fields in the SAMFileHeader.  This
     *                list of samples may contain a null element, and all reads without read groups will
     *                be mapped to this null sample
     */
    public LocusIteratorByState(final Iterator<GATKSAMRecord> samIterator,
                                final ReadProperties readInformation,
                                final GenomeLocParser genomeLocParser,
                                final Collection<String> samples) {
        this(samIterator,
                toDownsamplingInfo(readInformation),
                readInformation.includeReadsWithDeletionAtLoci(),
                genomeLocParser,
                samples,
                readInformation.keepUniqueReadListInLIBS());
    }

    /**
     * Create a new LocusIteratorByState based on a SAMFileReader using reads in an iterator it
     *
     * Simple constructor that uses the samples in the reader, doesn't do any downsampling,
     * and makes a new GenomeLocParser using the reader.  This constructor will be slow(ish)
     * if you continually invoke this constructor, but it's easy to make.
     *
     * @param reader a non-null reader
     * @param it an iterator from reader that has the reads we want to use to create ReadBackPileups
     */
    public LocusIteratorByState(final SAMFileReader reader, final CloseableIterator<SAMRecord> it) {
        this(new GATKSAMRecordIterator(it),
                new LIBSDownsamplingInfo(false, 0),
                true,
                new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()),
                SampleUtils.getSAMFileSamples(reader.getFileHeader()),
                false);
    }

    /**
     * Create a new LocusIteratorByState
     *
     * @param samIterator the iterator of reads to process into pileups.  Reads must be ordered
     *                    according to standard coordinate-sorted BAM conventions
     * @param downsamplingInfo meta-information about how to downsampling the reads
     * @param genomeLocParser used to create genome locs
     * @param samples a complete list of samples present in the read groups for the reads coming from samIterator.
     *                This is generally just the set of read group sample fields in the SAMFileHeader.  This
     *                list of samples may contain a null element, and all reads without read groups will
     *                be mapped to this null sample
     * @param maintainUniqueReadsList if true, we will keep the unique reads from off the samIterator and make them
     *                                available via the transferReadsFromAllPreviousPileups interface
     */
    public LocusIteratorByState(final Iterator<GATKSAMRecord> samIterator,
                                final LIBSDownsamplingInfo downsamplingInfo,
                                final boolean includeReadsWithDeletionAtLoci,
                                final GenomeLocParser genomeLocParser,
                                final Collection<String> samples,
                                final boolean maintainUniqueReadsList) {
        if ( samIterator == null ) throw new IllegalArgumentException("samIterator cannot be null");
        if ( downsamplingInfo == null ) throw new IllegalArgumentException("downsamplingInfo cannot be null");
        if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");
        if ( samples == null ) throw new IllegalArgumentException("Samples cannot be null");

        // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
        // there's no read data.  So we need to throw this error only when samIterator.hasNext() is true
        if (samples.isEmpty() && samIterator.hasNext()) {
            throw new IllegalArgumentException("samples list must not be empty");
        }

        this.genomeLocParser = genomeLocParser;
        this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
        this.samples = new ArrayList<String>(samples);
        this.readStates = new ReadStateManager(samIterator, this.samples, downsamplingInfo, maintainUniqueReadsList);
    }

    @Override
    public Iterator<AlignmentContext> iterator() {
        return this;
    }

    /**
     * Get the current location (i.e., the bp of the center of the pileup) of the pileup, or null if not anywhere yet
     *
     * Assumes that read states is updated to reflect the current pileup position, but not advanced to the
     * next location.
     *
     * @return the location of the current pileup, or null if we're after all reads
     */
    private GenomeLoc getLocation() {
        return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    // next() routine and associated collection operations
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Is there another pileup available?
     * @return
     */
    @Override
    public boolean hasNext() {
        lazyLoadNextAlignmentContext();
        return nextAlignmentContext != null;
    }

    /**
     * Get the next AlignmentContext available from the reads.
     *
     * @return a non-null AlignmentContext of the pileup after to the next genomic position covered by
     * at least one read.
     */
    @Override
    public AlignmentContext next() {
        lazyLoadNextAlignmentContext();
        if (!hasNext())
            throw new NoSuchElementException("LocusIteratorByState: out of elements.");
        AlignmentContext currentAlignmentContext = nextAlignmentContext;
        nextAlignmentContext = null;
        return currentAlignmentContext;
    }

    /**
     * Move this LIBS until we are over position
     *
     * Will return null if cannot reach position (because we run out of data in the locus)
     *
     * @param position the start position of the AlignmentContext we want back
     * @param stopAtFirstNonEmptySiteAfterPosition if true, we will stop as soon as we find a context with data with
     *                                             position >= position, otherwise we will return a null value
     *                                             and consume the data for the next position.  This means that without
     *                                             specifying this value the LIBS will be in an indeterminate state
     *                                             after calling this function, and should be reconstructed from scratch
     *                                             for subsequent use
     * @return a AlignmentContext at position, or null if this isn't possible
     */
    public AlignmentContext advanceToLocus(final int position, final boolean stopAtFirstNonEmptySiteAfterPosition) {
        while ( hasNext() ) {
            final AlignmentContext context = next();

            if ( context == null )
                // we ran out of data
                return null;

            if ( context.getPosition() == position )
                return context;

            if ( context.getPosition() > position)
                return stopAtFirstNonEmptySiteAfterPosition ? context : null;
        }

        return null;
    }

    /**
     * Creates the next alignment context from the given state.  Note that this is implemented as a
     * lazy load method. nextAlignmentContext MUST BE null in order for this method to advance to the
     * next entry.
     */
    private void lazyLoadNextAlignmentContext() {
        while (nextAlignmentContext == null && readStates.hasNext()) {
            readStates.collectPendingReads();

            final GenomeLoc location = getLocation();
            final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();

            for (final Map.Entry<String, PerSampleReadStateManager> sampleStatePair : readStates ) {
                final String sample = sampleStatePair.getKey();
                final PerSampleReadStateManager readState = sampleStatePair.getValue();
                final Iterator<AlignmentStateMachine> iterator = readState.iterator();
                final List<PileupElement> pile = new ArrayList<PileupElement>(readState.size());

                while (iterator.hasNext()) {
                    // state object with the read/offset information
                    final AlignmentStateMachine state = iterator.next();
                    final GATKSAMRecord read = state.getRead();
                    final CigarOperator op = state.getCigarOperator();

                    if (op == CigarOperator.N) // N's are never added to any pileup
                        continue;

                    if (!dontIncludeReadInPileup(read, location.getStart())) {
                        if ( ! includeReadsWithDeletionAtLoci && op == CigarOperator.D ) {
                            continue;
                        }

                        pile.add(state.makePileupElement());
                    }
                }

                if (! pile.isEmpty() ) // if this pileup added at least one base, add it to the full pileup
                    fullPileup.put(sample, new ReadBackedPileupImpl(location, pile));
            }

            readStates.updateReadStates(); // critical - must be called after we get the current state offsets and location
            if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
                nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), false);
        }
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    // getting the list of reads
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Transfer current list of all unique reads that have ever been used in any pileup, clearing old list
     *
     * This list is guaranteed to only contain unique reads, even across calls to the this function.  It is
     * literally the unique set of reads ever seen.
     *
     * The list occurs in the same order as they are encountered in the underlying iterator.
     *
     * Takes the maintained list of submitted reads, and transfers it to the caller of this
     * function.  The old list of set to a new, cleanly allocated list so the caller officially
     * owns the list returned by this call.  This is the only way to clear the tracking
     * of submitted reads, if enabled.
     *
     * The purpose of this function is allow users of LIBS to keep track of all of the reads pulled off the
     * underlying GATKSAMRecord iterator and that appeared at any point in the list of SAMRecordAlignmentState for
     * any reads.  This function is intended to allow users to efficiently reconstruct the unique set of reads
     * used across all pileups.  This is necessary for LIBS to handle because attempting to do
     * so from the pileups coming out of LIBS is extremely expensive.
     *
     * This functionality is only available if LIBS was created with the argument to track the reads
     *
     * @throws UnsupportedOperationException if called when keepingSubmittedReads is false
     *
     * @return the current list
     */
    @Ensures("result != null")
    public List<GATKSAMRecord> transferReadsFromAllPreviousPileups() {
        return readStates.transferSubmittedReads();
    }

    /**
     * Get the underlying list of tracked reads.  For testing only
     * @return a non-null list
     */
    @Ensures("result != null")
    protected List<GATKSAMRecord> getReadsFromAllPreviousPileups() {
        return readStates.getSubmittedReads();
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    // utility functions
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Should this read be excluded from the pileup?
     *
     * Generic place to put per-base filters appropriate to LocusIteratorByState
     *
     * @param rec the read to potentially exclude
     * @param pos the genomic position of the current alignment
     * @return true if the read should be excluded from the pileup, false otherwise
     */
    @Requires({"rec != null", "pos > 0"})
    private boolean dontIncludeReadInPileup(final GATKSAMRecord rec, final long pos) {
        return ReadUtils.isBaseInsideAdaptor(rec, pos);
    }

    /**
     * Create a LIBSDownsamplingInfo object from the requested info in ReadProperties
     *
     * LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're
     * downsampling to coverage by sample. SAMDataSource will have refrained from applying
     * any downsamplers to the read stream in this case, in the expectation that LIBS will
     * manage the downsampling. The reason for this is twofold: performance (don't have to
     * split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling
     * of reads (eg., using half of a read, and throwing the rest away).
     *
     * @param readInfo GATK engine information about what should be done to the reads
     * @return a LIBS specific info holder about downsampling only
     */
    @Requires("readInfo != null")
    @Ensures("result != null")
    private static LIBSDownsamplingInfo toDownsamplingInfo(final ReadProperties readInfo) {
        final boolean performDownsampling = readInfo.getDownsamplingMethod() != null &&
                readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
                readInfo.getDownsamplingMethod().toCoverage != null;
        final int coverage = performDownsampling ? readInfo.getDownsamplingMethod().toCoverage : 0;

        return new LIBSDownsamplingInfo(performDownsampling, coverage);
    }

    /**
     * Create a pileup element for read at offset
     *
     * offset must correspond to a valid read offset given the read's cigar, or an IllegalStateException will be throw
     *
     * @param read a read
     * @param offset the offset into the bases we'd like to use in the pileup
     * @return a valid PileupElement with read and at offset
     */
    @Ensures("result != null")
    public static PileupElement createPileupForReadAndOffset(final GATKSAMRecord read, final int offset) {
        if ( read == null ) throw new IllegalArgumentException("read cannot be null");
        if ( offset < 0 || offset >= read.getReadLength() ) throw new IllegalArgumentException("Invalid offset " + offset + " outside of bounds 0 and " + read.getReadLength());

        final AlignmentStateMachine stateMachine = new AlignmentStateMachine(read);

        while ( stateMachine.stepForwardOnGenome() != null ) {
            if ( stateMachine.getReadOffset() == offset )
                return stateMachine.makePileupElement();
        }

        throw new IllegalStateException("Tried to create a pileup for read " + read + " with offset " + offset +
                " but we never saw such an offset in the alignment state machine");
    }

    /**
     * For testing only.  Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
     * for the system.
     */
    public static List<String> sampleListForSAMWithoutReadGroups() {
        List<String> samples = new ArrayList<String>();
        samples.add(null);
        return samples;
    }
}
TOP

Related Classes of org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.