Package picard.analysis

Source Code of picard.analysis.CollectGcBiasMetrics$CalculateGcState

/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package picard.analysis;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.QualityUtil;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Metrics;
import picard.cmdline.StandardOptionDefinitions;
import picard.util.RExecutor;

import java.io.File;
import java.text.NumberFormat;
import java.util.Collection;
import java.util.List;

/**
* Tool to collect information about GC bias in the reads in a given BAM file. Computes
* the number of windows (of size specified by WINDOW_SIZE) in the genome at each GC%
* and counts the number of read starts in each GC bin.  What is output and plotted is
* the "normalized coverage" in each bin - i.e. the number of reads per window normalized
* to the average number of reads per window across the whole genome.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
        usage = "Tool to collect information about GC bias in the reads in a given BAM file. Computes" +
                " the number of windows (of size specified by WINDOW_SIZE) in the genome at each GC%" +
                " and counts the number of read starts in each GC bin.  What is output and plotted is" +
                " the \"normalized coverage\" in each bin - i.e. the number of reads per window normalized" +
                " to the average number of reads per window across the whole genome..\n",
        usageShort = "Collects information about GC bias in the reads in the provided SAM or BAM",
        programGroup = Metrics.class
)
public class CollectGcBiasMetrics extends CommandLineProgram {
    /** The location of the R script to do the plotting. */
    private static final String R_SCRIPT = "picard/analysis/gcBias.R";

    // Usage and parameters

    @Option(shortName=StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc="The reference sequence fasta file.")
    public File REFERENCE_SEQUENCE;

    @Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The BAM or SAM file containing aligned reads.  " +
            "Must be coordinate-sorted.")
    public File INPUT;

    @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="The text file to write the metrics table to.")
    public File OUTPUT;

    @Option(shortName="CHART", doc="The PDF file to render the chart to.")
    public File CHART_OUTPUT;

    @Option(shortName="S", doc="The text file to write summary metrics to.", optional=true)
    public File SUMMARY_OUTPUT;

    @Option(doc="The size of windows on the genome that are used to bin reads.")
    public int WINDOW_SIZE = 100;

    @Option(doc="For summary metrics, exclude GC windows that include less than this fraction of the genome.")
    public double MINIMUM_GENOME_FRACTION = 0.00001;

    @Option(doc="If true, assume that the input file is coordinate sorted, even if the header says otherwise.",
            shortName=StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME)
    public boolean ASSUME_SORTED = false;

    @Option(shortName="BS", doc="Whether the SAM or BAM file consists of bisulfite sequenced reads.  ")
    public boolean IS_BISULFITE_SEQUENCED = false;

    private static final Log log = Log.getInstance(CollectGcBiasMetrics.class);

    // Used to keep track of the total clusters as this is kinda important for bias
    private int totalClusters = 0;
    private int totalAlignedReads = 0;

    /** Stock main method. */
    public static void main(final String[] args) {
        System.exit(new CollectGcBiasMetrics().instanceMain(args));
    }

    protected int doWork() {
        IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsWritable(OUTPUT);
        IOUtil.assertFileIsWritable(CHART_OUTPUT);
        if (SUMMARY_OUTPUT != null) IOUtil.assertFileIsWritable(SUMMARY_OUTPUT);

        // Histograms to track the number of windows at each GC, and the number of read starts
        // at windows of each GC
        final int[] windowsByGc = new int[101];
        final int[] readsByGc   = new int[101];
        final long[] basesByGc  = new long[101];
        final long[] errorsByGc = new long[101];

        final SAMFileReader sam = new SAMFileReader(INPUT);

        if (!ASSUME_SORTED && sam.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new PicardException("Header of input file " + INPUT.getAbsolutePath() + " indicates that it is not coordinate sorted.  " +
            "If you believe the records are in coordinate order, pass option ASSUME_SORTED=true.  If not, sort the file with SortSam.");
        }
        final PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(sam.iterator());
        final ReferenceSequenceFile referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);

        {
            // Check that the sequence dictionaries match if present
            final SAMSequenceDictionary referenceDictionary= referenceFile.getSequenceDictionary();
            final SAMSequenceDictionary samFileDictionary = sam.getFileHeader().getSequenceDictionary();
            if (referenceDictionary != null && samFileDictionary != null) {
                SequenceUtil.assertSequenceDictionariesEqual(referenceDictionary, samFileDictionary);
            }
        }

        ////////////////////////////////////////////////////////////////////////////
        // Loop over the reference and the reads and calculate the basic metrics
        ////////////////////////////////////////////////////////////////////////////
        ReferenceSequence ref = null;
        final ProgressLogger
                progress = new ProgressLogger(log);
        while ((ref = referenceFile.nextSequence()) != null) {
            final byte[] refBases = ref.getBases();
            StringUtil.toUpperCase(refBases);
            final int refLength = refBases.length;
            final int lastWindowStart = refLength - WINDOW_SIZE;

            final byte[] gc = calculateAllGcs(refBases, windowsByGc, lastWindowStart);
            while (iterator.hasNext() && iterator.peek().getReferenceIndex() == ref.getContigIndex()) {
                final SAMRecord rec = iterator.next();

                if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++this.totalClusters;

                if (!rec.getReadUnmappedFlag()) {
                    final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - WINDOW_SIZE : rec.getAlignmentStart();
                    ++this.totalAlignedReads;

                    if (pos > 0) {
                        final int windowGc = gc[pos];

                        if (windowGc >= 0) {
                            ++readsByGc[windowGc];
                            basesByGc[windowGc+= rec.getReadLength();
                            errorsByGc[windowGc] +=
                                    SequenceUtil.countMismatches(rec, refBases, IS_BISULFITE_SEQUENCED) +
                                            SequenceUtil.countInsertedBases(rec) + SequenceUtil.countDeletedBases(rec);
                        }
                    }
                }

                progress.record(rec);
            }

            log.info("Processed reference sequence: " + ref.getName());
        }

        // Finish up the reads, presumably all unaligned
        while (iterator.hasNext()) {
            final SAMRecord rec = iterator.next();
            if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++this.totalClusters;

        }

        /////////////////////////////////////////////////////////////////////////////
        // Synthesize the normalized coverage metrics and write it all out to a file
        /////////////////////////////////////////////////////////////////////////////
        final MetricsFile<GcBiasDetailMetrics,?> metricsFile = getMetricsFile();
        final double totalWindows = sum(windowsByGc);
        final double totalReads   = sum(readsByGc);
        final double meanReadsPerWindow = totalReads / totalWindows;
        final double minimumWindowsToCountInSummary = totalWindows * this.MINIMUM_GENOME_FRACTION;

        for (int i=0; i<windowsByGc.length; ++i) {
            if (windowsByGc[i] == 0) continue;

            GcBiasDetailMetrics m = new GcBiasDetailMetrics();
            m.GC = i;
            m.WINDOWS             = windowsByGc[i];
            m.READ_STARTS         = readsByGc[i];
            if (errorsByGc[i] > 0) m.MEAN_BASE_QUALITY = QualityUtil.getPhredScoreFromObsAndErrors(basesByGc[i], errorsByGc[i]);
            m.NORMALIZED_COVERAGE = (m.READ_STARTS / (double) m.WINDOWS) / meanReadsPerWindow;
            m.ERROR_BAR_WIDTH     = (Math.sqrt(m.READ_STARTS) / (double) m.WINDOWS) / meanReadsPerWindow;

            metricsFile.addMetric(m);
        }

        metricsFile.write(OUTPUT);

        // Synthesize the high level metrics
        if (SUMMARY_OUTPUT != null) {
            final MetricsFile<GcBiasSummaryMetrics,?> summaryMetricsFile = getMetricsFile();
            final GcBiasSummaryMetrics summary = new GcBiasSummaryMetrics();
            summary.WINDOW_SIZE    = this.WINDOW_SIZE;
            summary.TOTAL_CLUSTERS = this.totalClusters;
            summary.ALIGNED_READS  = this.totalAlignedReads;
            calculateDropoutMetrics(metricsFile.getMetrics(), summary);

            summaryMetricsFile.addMetric(summary);
            summaryMetricsFile.write(SUMMARY_OUTPUT);
        }

        // Plot the results
        final NumberFormat fmt = NumberFormat.getIntegerInstance();
        fmt.setGroupingUsed(true);
        final String subtitle = "Total clusters: " + fmt.format(this.totalClusters) +
                                ", Aligned reads: " + fmt.format(this.totalAlignedReads);
        String title = INPUT.getName().replace(".duplicates_marked", "").replace(".aligned.bam", "");

        // Qualify the title with the library name iff it's for a single sample
        final List<SAMReadGroupRecord> readGroups = sam.getFileHeader().getReadGroups();
        if (readGroups.size() == 1) {
            title += "." + readGroups.get(0).getLibrary();
        }
        RExecutor.executeFromClasspath(R_SCRIPT,
                                       OUTPUT.getAbsolutePath(),
                                       CHART_OUTPUT.getAbsolutePath(),
                                       title,
                                       subtitle,
                                       String.valueOf(WINDOW_SIZE));

        return 0;
    }

    /** Sums the values in an int[]. */
    private double sum(final int[] values) {
        final int length = values.length;
        double total = 0;
        for (int i=0; i<length; ++i) {
            total += values[i];
        }

        return total;
    }

    /** Calculates the Illumina style AT and GC dropout numbers. */
    private void calculateDropoutMetrics(final Collection<GcBiasDetailMetrics> details,
                                         final GcBiasSummaryMetrics summary) {
        // First calculate the totals
        double totalReads   = 0;
        double totalWindows = 0;

        for (final GcBiasDetailMetrics detail : details) {
            totalReads += detail.READ_STARTS;
            totalWindows += detail.WINDOWS;
        }

        double atDropout = 0;
        double gcDropout = 0;

        for (final GcBiasDetailMetrics detail : details) {
            final double relativeReads   = detail.READ_STARTS / totalReads;
            final double relativeWindows = detail.WINDOWS / totalWindows;
            final double dropout = (relativeWindows - relativeReads) * 100;

            if (dropout > 0) {
                if (detail.GC <= 50) atDropout += dropout;
                if (detail.GC >= 50) gcDropout += dropout;
            }
        }

        summary.AT_DROPOUT = atDropout;
        summary.GC_DROPOUT = gcDropout;
    }

    /** Calculcate all the GC values for all windows. */
    private byte[] calculateAllGcs(final byte [] refBases, final int [] windowsByGc, final int lastWindowStart) {
        final int refLength = refBases.length;
        final byte[] gc = new byte[refLength + 1];
        final CalculateGcState state = new CalculateGcState();
        for (int i=1; i<lastWindowStart; ++i) {
            final int windowEnd = i + WINDOW_SIZE;
            final int windowGc = calculateGc(refBases, i, windowEnd, state) ;
            gc[i] = (byte) windowGc;
            if (windowGc != -1) windowsByGc[windowGc]++;
        }
        return gc;
    }

    /**
     * Calculates GC as a number from 0 to 100 in the specified window. If the window includes
     * more than five no-calls then -1 is returned.
     */
    private int calculateGc(final byte[] bases, final int startIndex, final int endIndex, final CalculateGcState state) {
        if (state.init) {
            state.init = false ;
            state.gcCount = 0;
            state.nCount  = 0;
            for (int i=startIndex; i<endIndex; ++i) {
                final byte base = bases[i];
                if (base == 'G' || base == 'C') ++state.gcCount;
                else if (base == 'N') ++state.nCount;
            }
        } else {
            final byte newBase = bases[endIndex-1];
            if (newBase == 'G' || newBase == 'C') ++state.gcCount;
            else if (newBase == 'N') ++state.nCount;

            if (state.priorBase == 'G' || state.priorBase == 'C') --state.gcCount;
            else if (state.priorBase == 'N') --state.nCount;
        }
        state.priorBase = bases[startIndex];
        if (state.nCount > 4) return -1;
        else return (state.gcCount * 100) / (endIndex - startIndex);
    }

    /** Keeps track of current GC calculation state. */
    class CalculateGcState {
        boolean init = true ;
        int nCount ;
        int gcCount ;
        byte priorBase ;
    }
}
TOP

Related Classes of picard.analysis.CollectGcBiasMetrics$CalculateGcState

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.