Source Code of picard.analysis.CollectGcBiasMetrics$CalculateGcState

/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */


package picard.analysis;


import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.QualityUtil;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Metrics;
import picard.cmdline.StandardOptionDefinitions;
import picard.util.RExecutor;


import java.io.File;
import java.text.NumberFormat;
import java.util.Collection;
import java.util.List;


/**
 * Tool to collect information about GC bias in the reads in a given BAM file. Computes
 * the number of windows (of size specified by WINDOW_SIZE) in the genome at each GC%
 * and counts the number of read starts in each GC bin.  What is output and plotted is
 * the "normalized coverage" in each bin - i.e. the number of reads per window normalized
 * to the average number of reads per window across the whole genome.
 *
 * @author Tim Fennell
 */
@CommandLineProgramProperties(
        usage = "Tool to collect information about GC bias in the reads in a given BAM file. Computes" +
                " the number of windows (of size specified by WINDOW_SIZE) in the genome at each GC%" +
                " and counts the number of read starts in each GC bin.  What is output and plotted is" +
                " the \"normalized coverage\" in each bin - i.e. the number of reads per window normalized" +
                " to the average number of reads per window across the whole genome..\n",
        usageShort = "Collects information about GC bias in the reads in the provided SAM or BAM",
        programGroup = Metrics.class
)
public class CollectGcBiasMetrics extends CommandLineProgram {
    /** The location of the R script to do the plotting. */
    private static final String R_SCRIPT = "picard/analysis/gcBias.R";


    // Usage and parameters


    @Option(shortName=StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc="The reference sequence fasta file.")
    public File REFERENCE_SEQUENCE;


    @Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The BAM or SAM file containing aligned reads.  " +
            "Must be coordinate-sorted.")
    public File INPUT;


    @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="The text file to write the metrics table to.")
    public File OUTPUT;


    @Option(shortName="CHART", doc="The PDF file to render the chart to.")
    public File CHART_OUTPUT;


    @Option(shortName="S", doc="The text file to write summary metrics to.", optional=true)
    public File SUMMARY_OUTPUT;


    @Option(doc="The size of windows on the genome that are used to bin reads.")
    public int WINDOW_SIZE = 100;


    @Option(doc="For summary metrics, exclude GC windows that include less than this fraction of the genome.")
    public double MINIMUM_GENOME_FRACTION = 0.00001;


    @Option(doc="If true, assume that the input file is coordinate sorted, even if the header says otherwise.",
            shortName=StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME)
    public boolean ASSUME_SORTED = false;


    @Option(shortName="BS", doc="Whether the SAM or BAM file consists of bisulfite sequenced reads.  ")
    public boolean IS_BISULFITE_SEQUENCED = false;


    private static final Log log = Log.getInstance(CollectGcBiasMetrics.class);


    // Used to keep track of the total clusters as this is kinda important for bias
    private int totalClusters = 0;
    private int totalAlignedReads = 0;


    /** Stock main method. */
    public static void main(final String[] args) {
        System.exit(new CollectGcBiasMetrics().instanceMain(args));
    }


    protected int doWork() {
        IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsWritable(OUTPUT);
        IOUtil.assertFileIsWritable(CHART_OUTPUT);
        if (SUMMARY_OUTPUT != null) IOUtil.assertFileIsWritable(SUMMARY_OUTPUT);


        // Histograms to track the number of windows at each GC, and the number of read starts
        // at windows of each GC
        final int[] windowsByGc = new int[101];
        final int[] readsByGc   = new int[101];
        final long[] basesByGc  = new long[101];
        final long[] errorsByGc = new long[101];


        final SAMFileReader sam = new SAMFileReader(INPUT);


        if (!ASSUME_SORTED && sam.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new PicardException("Header of input file " + INPUT.getAbsolutePath() + " indicates that it is not coordinate sorted.  " +
            "If you believe the records are in coordinate order, pass option ASSUME_SORTED=true.  If not, sort the file with SortSam.");
        }
        final PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(sam.iterator());
        final ReferenceSequenceFile referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);


        {
            // Check that the sequence dictionaries match if present
            final SAMSequenceDictionary referenceDictionary= referenceFile.getSequenceDictionary();
            final SAMSequenceDictionary samFileDictionary = sam.getFileHeader().getSequenceDictionary();
            if (referenceDictionary != null && samFileDictionary != null) {
                SequenceUtil.assertSequenceDictionariesEqual(referenceDictionary, samFileDictionary);
            }
        }


        ////////////////////////////////////////////////////////////////////////////
        // Loop over the reference and the reads and calculate the basic metrics
        ////////////////////////////////////////////////////////////////////////////
        ReferenceSequence ref = null;
        final ProgressLogger
                progress = new ProgressLogger(log);
        while ((ref = referenceFile.nextSequence()) != null) {
            final byte[] refBases = ref.getBases();
            StringUtil.toUpperCase(refBases);
            final int refLength = refBases.length;
            final int lastWindowStart = refLength - WINDOW_SIZE;


            final byte[] gc = calculateAllGcs(refBases, windowsByGc, lastWindowStart);
            while (iterator.hasNext() && iterator.peek().getReferenceIndex() == ref.getContigIndex()) {
                final SAMRecord rec = iterator.next();


                if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++this.totalClusters;


                if (!rec.getReadUnmappedFlag()) {
                    final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - WINDOW_SIZE : rec.getAlignmentStart();
                    ++this.totalAlignedReads;


                    if (pos > 0) {
                        final int windowGc = gc[pos];


                        if (windowGc >= 0) {
                            ++readsByGc[windowGc];
                            basesByGc[windowGc]  += rec.getReadLength();
                            errorsByGc[windowGc] +=
                                    SequenceUtil.countMismatches(rec, refBases, IS_BISULFITE_SEQUENCED) +
                                            SequenceUtil.countInsertedBases(rec) + SequenceUtil.countDeletedBases(rec);
                        }
                    }
                }


                progress.record(rec);
            }


            log.info("Processed reference sequence: " + ref.getName());
        }


        // Finish up the reads, presumably all unaligned
        while (iterator.hasNext()) {
            final SAMRecord rec = iterator.next();
            if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++this.totalClusters;


        }


        /////////////////////////////////////////////////////////////////////////////
        // Synthesize the normalized coverage metrics and write it all out to a file
        /////////////////////////////////////////////////////////////////////////////
        final MetricsFile<GcBiasDetailMetrics,?> metricsFile = getMetricsFile();
        final double totalWindows = sum(windowsByGc);
        final double totalReads   = sum(readsByGc);
        final double meanReadsPerWindow = totalReads / totalWindows;
        final double minimumWindowsToCountInSummary = totalWindows * this.MINIMUM_GENOME_FRACTION;


        for (int i=0; i<windowsByGc.length; ++i) {
            if (windowsByGc[i] == 0) continue;


            GcBiasDetailMetrics m = new GcBiasDetailMetrics();
            m.GC = i;
            m.WINDOWS             = windowsByGc[i];
            m.READ_STARTS         = readsByGc[i];
            if (errorsByGc[i] > 0) m.MEAN_BASE_QUALITY = QualityUtil.getPhredScoreFromObsAndErrors(basesByGc[i], errorsByGc[i]);
            m.NORMALIZED_COVERAGE = (m.READ_STARTS / (double) m.WINDOWS) / meanReadsPerWindow;
            m.ERROR_BAR_WIDTH     = (Math.sqrt(m.READ_STARTS) / (double) m.WINDOWS) / meanReadsPerWindow;


            metricsFile.addMetric(m);
        }


        metricsFile.write(OUTPUT);


        // Synthesize the high level metrics
        if (SUMMARY_OUTPUT != null) {
            final MetricsFile<GcBiasSummaryMetrics,?> summaryMetricsFile = getMetricsFile();
            final GcBiasSummaryMetrics summary = new GcBiasSummaryMetrics();
            summary.WINDOW_SIZE    = this.WINDOW_SIZE;
            summary.TOTAL_CLUSTERS = this.totalClusters;
            summary.ALIGNED_READS  = this.totalAlignedReads;
            calculateDropoutMetrics(metricsFile.getMetrics(), summary);


            summaryMetricsFile.addMetric(summary);
            summaryMetricsFile.write(SUMMARY_OUTPUT);
        }


        // Plot the results
        final NumberFormat fmt = NumberFormat.getIntegerInstance();
        fmt.setGroupingUsed(true);
        final String subtitle = "Total clusters: " + fmt.format(this.totalClusters) +
                                ", Aligned reads: " + fmt.format(this.totalAlignedReads);
        String title = INPUT.getName().replace(".duplicates_marked", "").replace(".aligned.bam", "");


        // Qualify the title with the library name iff it's for a single sample
        final List<SAMReadGroupRecord> readGroups = sam.getFileHeader().getReadGroups();
        if (readGroups.size() == 1) {
            title += "." + readGroups.get(0).getLibrary();
        }
        RExecutor.executeFromClasspath(R_SCRIPT,
                                       OUTPUT.getAbsolutePath(),
                                       CHART_OUTPUT.getAbsolutePath(),
                                       title,
                                       subtitle,
                                       String.valueOf(WINDOW_SIZE));


        return 0;
    }


    /** Sums the values in an int[]. */
    private double sum(final int[] values) {
        final int length = values.length;
        double total = 0;
        for (int i=0; i<length; ++i) {
            total += values[i];
        }


        return total;
    }


    /** Calculates the Illumina style AT and GC dropout numbers. */
    private void calculateDropoutMetrics(final Collection<GcBiasDetailMetrics> details,
                                         final GcBiasSummaryMetrics summary) {
        // First calculate the totals
        double totalReads   = 0;
        double totalWindows = 0;


        for (final GcBiasDetailMetrics detail : details) {
            totalReads += detail.READ_STARTS;
            totalWindows += detail.WINDOWS;
        }


        double atDropout = 0;
        double gcDropout = 0;


        for (final GcBiasDetailMetrics detail : details) {
            final double relativeReads   = detail.READ_STARTS / totalReads;
            final double relativeWindows = detail.WINDOWS / totalWindows;
            final double dropout = (relativeWindows - relativeReads) * 100;


            if (dropout > 0) {
                if (detail.GC <= 50) atDropout += dropout;
                if (detail.GC >= 50) gcDropout += dropout;
            }
        }


        summary.AT_DROPOUT = atDropout;
        summary.GC_DROPOUT = gcDropout;
    }


    /** Calculcate all the GC values for all windows. */
    private byte[] calculateAllGcs(final byte [] refBases, final int [] windowsByGc, final int lastWindowStart) {
        final int refLength = refBases.length;
        final byte[] gc = new byte[refLength + 1];
        final CalculateGcState state = new CalculateGcState();
        for (int i=1; i<lastWindowStart; ++i) {
            final int windowEnd = i + WINDOW_SIZE;
            final int windowGc = calculateGc(refBases, i, windowEnd, state) ; 
            gc[i] = (byte) windowGc;
            if (windowGc != -1) windowsByGc[windowGc]++;
        }
        return gc;
    }


    /**
     * Calculates GC as a number from 0 to 100 in the specified window. If the window includes
     * more than five no-calls then -1 is returned.
     */
    private int calculateGc(final byte[] bases, final int startIndex, final int endIndex, final CalculateGcState state) {
        if (state.init) {
            state.init = false ;
            state.gcCount = 0;
            state.nCount  = 0;
            for (int i=startIndex; i<endIndex; ++i) {
                final byte base = bases[i];
                if (base == 'G' || base == 'C') ++state.gcCount;
                else if (base == 'N') ++state.nCount;
            }
        } else {
            final byte newBase = bases[endIndex-1];
            if (newBase == 'G' || newBase == 'C') ++state.gcCount;
            else if (newBase == 'N') ++state.nCount;


            if (state.priorBase == 'G' || state.priorBase == 'C') --state.gcCount;
            else if (state.priorBase == 'N') --state.nCount;
        }
        state.priorBase = bases[startIndex];
        if (state.nCount > 4) return -1;
        else return (state.gcCount * 100) / (endIndex - startIndex);
    }


    /** Keeps track of current GC calculation state. */
    class CalculateGcState {
        boolean init = true ;
        int nCount ;
        int gcCount ;
        byte priorBase ;
    }
}
Source Code of picard.analysis.CollectGcBiasMetrics$CalculateGcState

Related Classes of picard.analysis.CollectGcBiasMetrics$CalculateGcState