/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.analysis;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.QualityUtil;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Metrics;
import picard.cmdline.StandardOptionDefinitions;
import picard.util.RExecutor;
import java.io.File;
import java.text.NumberFormat;
import java.util.Collection;
import java.util.List;
/**
* Tool to collect information about GC bias in the reads in a given BAM file. Computes
* the number of windows (of size specified by WINDOW_SIZE) in the genome at each GC%
* and counts the number of read starts in each GC bin. What is output and plotted is
* the "normalized coverage" in each bin - i.e. the number of reads per window normalized
* to the average number of reads per window across the whole genome.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = "Tool to collect information about GC bias in the reads in a given BAM file. Computes" +
" the number of windows (of size specified by WINDOW_SIZE) in the genome at each GC%" +
" and counts the number of read starts in each GC bin. What is output and plotted is" +
" the \"normalized coverage\" in each bin - i.e. the number of reads per window normalized" +
" to the average number of reads per window across the whole genome..\n",
usageShort = "Collects information about GC bias in the reads in the provided SAM or BAM",
programGroup = Metrics.class
)
public class CollectGcBiasMetrics extends CommandLineProgram {
/** The location of the R script to do the plotting. */
private static final String R_SCRIPT = "picard/analysis/gcBias.R";
// Usage and parameters
@Option(shortName=StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc="The reference sequence fasta file.")
public File REFERENCE_SEQUENCE;
@Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The BAM or SAM file containing aligned reads. " +
"Must be coordinate-sorted.")
public File INPUT;
@Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="The text file to write the metrics table to.")
public File OUTPUT;
@Option(shortName="CHART", doc="The PDF file to render the chart to.")
public File CHART_OUTPUT;
@Option(shortName="S", doc="The text file to write summary metrics to.", optional=true)
public File SUMMARY_OUTPUT;
@Option(doc="The size of windows on the genome that are used to bin reads.")
public int WINDOW_SIZE = 100;
@Option(doc="For summary metrics, exclude GC windows that include less than this fraction of the genome.")
public double MINIMUM_GENOME_FRACTION = 0.00001;
@Option(doc="If true, assume that the input file is coordinate sorted, even if the header says otherwise.",
shortName=StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME)
public boolean ASSUME_SORTED = false;
@Option(shortName="BS", doc="Whether the SAM or BAM file consists of bisulfite sequenced reads. ")
public boolean IS_BISULFITE_SEQUENCED = false;
private static final Log log = Log.getInstance(CollectGcBiasMetrics.class);
// Used to keep track of the total clusters as this is kinda important for bias
private int totalClusters = 0;
private int totalAlignedReads = 0;
/** Stock main method. */
public static void main(final String[] args) {
System.exit(new CollectGcBiasMetrics().instanceMain(args));
}
protected int doWork() {
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsWritable(CHART_OUTPUT);
if (SUMMARY_OUTPUT != null) IOUtil.assertFileIsWritable(SUMMARY_OUTPUT);
// Histograms to track the number of windows at each GC, and the number of read starts
// at windows of each GC
final int[] windowsByGc = new int[101];
final int[] readsByGc = new int[101];
final long[] basesByGc = new long[101];
final long[] errorsByGc = new long[101];
final SAMFileReader sam = new SAMFileReader(INPUT);
if (!ASSUME_SORTED && sam.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new PicardException("Header of input file " + INPUT.getAbsolutePath() + " indicates that it is not coordinate sorted. " +
"If you believe the records are in coordinate order, pass option ASSUME_SORTED=true. If not, sort the file with SortSam.");
}
final PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(sam.iterator());
final ReferenceSequenceFile referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
{
// Check that the sequence dictionaries match if present
final SAMSequenceDictionary referenceDictionary= referenceFile.getSequenceDictionary();
final SAMSequenceDictionary samFileDictionary = sam.getFileHeader().getSequenceDictionary();
if (referenceDictionary != null && samFileDictionary != null) {
SequenceUtil.assertSequenceDictionariesEqual(referenceDictionary, samFileDictionary);
}
}
////////////////////////////////////////////////////////////////////////////
// Loop over the reference and the reads and calculate the basic metrics
////////////////////////////////////////////////////////////////////////////
ReferenceSequence ref = null;
final ProgressLogger
progress = new ProgressLogger(log);
while ((ref = referenceFile.nextSequence()) != null) {
final byte[] refBases = ref.getBases();
StringUtil.toUpperCase(refBases);
final int refLength = refBases.length;
final int lastWindowStart = refLength - WINDOW_SIZE;
final byte[] gc = calculateAllGcs(refBases, windowsByGc, lastWindowStart);
while (iterator.hasNext() && iterator.peek().getReferenceIndex() == ref.getContigIndex()) {
final SAMRecord rec = iterator.next();
if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++this.totalClusters;
if (!rec.getReadUnmappedFlag()) {
final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - WINDOW_SIZE : rec.getAlignmentStart();
++this.totalAlignedReads;
if (pos > 0) {
final int windowGc = gc[pos];
if (windowGc >= 0) {
++readsByGc[windowGc];
basesByGc[windowGc] += rec.getReadLength();
errorsByGc[windowGc] +=
SequenceUtil.countMismatches(rec, refBases, IS_BISULFITE_SEQUENCED) +
SequenceUtil.countInsertedBases(rec) + SequenceUtil.countDeletedBases(rec);
}
}
}
progress.record(rec);
}
log.info("Processed reference sequence: " + ref.getName());
}
// Finish up the reads, presumably all unaligned
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++this.totalClusters;
}
/////////////////////////////////////////////////////////////////////////////
// Synthesize the normalized coverage metrics and write it all out to a file
/////////////////////////////////////////////////////////////////////////////
final MetricsFile<GcBiasDetailMetrics,?> metricsFile = getMetricsFile();
final double totalWindows = sum(windowsByGc);
final double totalReads = sum(readsByGc);
final double meanReadsPerWindow = totalReads / totalWindows;
final double minimumWindowsToCountInSummary = totalWindows * this.MINIMUM_GENOME_FRACTION;
for (int i=0; i<windowsByGc.length; ++i) {
if (windowsByGc[i] == 0) continue;
GcBiasDetailMetrics m = new GcBiasDetailMetrics();
m.GC = i;
m.WINDOWS = windowsByGc[i];
m.READ_STARTS = readsByGc[i];
if (errorsByGc[i] > 0) m.MEAN_BASE_QUALITY = QualityUtil.getPhredScoreFromObsAndErrors(basesByGc[i], errorsByGc[i]);
m.NORMALIZED_COVERAGE = (m.READ_STARTS / (double) m.WINDOWS) / meanReadsPerWindow;
m.ERROR_BAR_WIDTH = (Math.sqrt(m.READ_STARTS) / (double) m.WINDOWS) / meanReadsPerWindow;
metricsFile.addMetric(m);
}
metricsFile.write(OUTPUT);
// Synthesize the high level metrics
if (SUMMARY_OUTPUT != null) {
final MetricsFile<GcBiasSummaryMetrics,?> summaryMetricsFile = getMetricsFile();
final GcBiasSummaryMetrics summary = new GcBiasSummaryMetrics();
summary.WINDOW_SIZE = this.WINDOW_SIZE;
summary.TOTAL_CLUSTERS = this.totalClusters;
summary.ALIGNED_READS = this.totalAlignedReads;
calculateDropoutMetrics(metricsFile.getMetrics(), summary);
summaryMetricsFile.addMetric(summary);
summaryMetricsFile.write(SUMMARY_OUTPUT);
}
// Plot the results
final NumberFormat fmt = NumberFormat.getIntegerInstance();
fmt.setGroupingUsed(true);
final String subtitle = "Total clusters: " + fmt.format(this.totalClusters) +
", Aligned reads: " + fmt.format(this.totalAlignedReads);
String title = INPUT.getName().replace(".duplicates_marked", "").replace(".aligned.bam", "");
// Qualify the title with the library name iff it's for a single sample
final List<SAMReadGroupRecord> readGroups = sam.getFileHeader().getReadGroups();
if (readGroups.size() == 1) {
title += "." + readGroups.get(0).getLibrary();
}
RExecutor.executeFromClasspath(R_SCRIPT,
OUTPUT.getAbsolutePath(),
CHART_OUTPUT.getAbsolutePath(),
title,
subtitle,
String.valueOf(WINDOW_SIZE));
return 0;
}
/** Sums the values in an int[]. */
private double sum(final int[] values) {
final int length = values.length;
double total = 0;
for (int i=0; i<length; ++i) {
total += values[i];
}
return total;
}
/** Calculates the Illumina style AT and GC dropout numbers. */
private void calculateDropoutMetrics(final Collection<GcBiasDetailMetrics> details,
final GcBiasSummaryMetrics summary) {
// First calculate the totals
double totalReads = 0;
double totalWindows = 0;
for (final GcBiasDetailMetrics detail : details) {
totalReads += detail.READ_STARTS;
totalWindows += detail.WINDOWS;
}
double atDropout = 0;
double gcDropout = 0;
for (final GcBiasDetailMetrics detail : details) {
final double relativeReads = detail.READ_STARTS / totalReads;
final double relativeWindows = detail.WINDOWS / totalWindows;
final double dropout = (relativeWindows - relativeReads) * 100;
if (dropout > 0) {
if (detail.GC <= 50) atDropout += dropout;
if (detail.GC >= 50) gcDropout += dropout;
}
}
summary.AT_DROPOUT = atDropout;
summary.GC_DROPOUT = gcDropout;
}
/** Calculcate all the GC values for all windows. */
private byte[] calculateAllGcs(final byte [] refBases, final int [] windowsByGc, final int lastWindowStart) {
final int refLength = refBases.length;
final byte[] gc = new byte[refLength + 1];
final CalculateGcState state = new CalculateGcState();
for (int i=1; i<lastWindowStart; ++i) {
final int windowEnd = i + WINDOW_SIZE;
final int windowGc = calculateGc(refBases, i, windowEnd, state) ;
gc[i] = (byte) windowGc;
if (windowGc != -1) windowsByGc[windowGc]++;
}
return gc;
}
/**
* Calculates GC as a number from 0 to 100 in the specified window. If the window includes
* more than five no-calls then -1 is returned.
*/
private int calculateGc(final byte[] bases, final int startIndex, final int endIndex, final CalculateGcState state) {
if (state.init) {
state.init = false ;
state.gcCount = 0;
state.nCount = 0;
for (int i=startIndex; i<endIndex; ++i) {
final byte base = bases[i];
if (base == 'G' || base == 'C') ++state.gcCount;
else if (base == 'N') ++state.nCount;
}
} else {
final byte newBase = bases[endIndex-1];
if (newBase == 'G' || newBase == 'C') ++state.gcCount;
else if (newBase == 'N') ++state.nCount;
if (state.priorBase == 'G' || state.priorBase == 'C') --state.gcCount;
else if (state.priorBase == 'N') --state.nCount;
}
state.priorBase = bases[startIndex];
if (state.nCount > 4) return -1;
else return (state.gcCount * 100) / (endIndex - startIndex);
}
/** Keeps track of current GC calculation state. */
class CalculateGcState {
boolean init = true ;
int nCount ;
int gcCount ;
byte priorBase ;
}
}