package picard.analysis;
import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.BAMRecord;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.ReservedTagConstants;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.metrics.PerUnitMetricCollector;
import picard.metrics.SAMRecordAndReference;
import picard.metrics.SAMRecordAndReferenceMultiLevelCollector;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class AlignmentSummaryMetricsCollector extends SAMRecordAndReferenceMultiLevelCollector<AlignmentSummaryMetrics, Comparable<?>> {
// If we have a reference sequence, collect metrics on how well we aligned to it
private final boolean doRefMetrics;
//the adapter sequences converted to byte arrays
private final byte[][] adapterKmers;
//A list of Strings representing the sequence of bases in an adapter
private final List<String> adapterSequence;
//Paired end reads above this insert size will be considered chimeric along with inter-chromosomal pairs.
private final int maxInsertSize;
//Whether the SAM or BAM file consists of bisulfite sequenced reads.
private final boolean isBisulfiteSequenced;
//The minimum mapping quality a base has to meet in order to be considered high quality
private final int MAPPING_QUALITY_THRESOLD = 20;
//The minimum quality a base has to meet in order to be consider hq_20
private final static int BASE_QUALITY_THRESHOLD = 20;
//The number of bases to check in order to map a read to an adapter
private static final int ADAPTER_MATCH_LENGTH = 16;
// The maximum number of mismatches a read can have and still be considered as matching an adapter
private static final int MAX_ADAPTER_ERRORS = 1;
public AlignmentSummaryMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels, final List<SAMReadGroupRecord> samRgRecords,
final boolean doRefMetrics, final List<String> adapterSequence, final int maxInsertSize, boolean isBisulfiteSequenced) {
this.doRefMetrics = doRefMetrics;
this.adapterSequence = adapterSequence;
this.adapterKmers = prepareAdapterSequences();
this.maxInsertSize = maxInsertSize;
this.isBisulfiteSequenced = isBisulfiteSequenced;
setup(accumulationLevels, samRgRecords);
}
@Override
protected PerUnitMetricCollector<AlignmentSummaryMetrics, Comparable<?>, SAMRecordAndReference> makeChildCollector(String sample, String library, String readGroup) {
return new GroupAlignmentSummaryMetricsPerUnitMetricCollector(sample, library, readGroup);
}
@Override
public void acceptRecord(final SAMRecord rec, final ReferenceSequence ref) {
if (!rec.isSecondaryOrSupplementary()) {
super.acceptRecord(rec, ref);
}
}
/** Converts the supplied adapter sequences to byte arrays in both fwd and rc. */
private byte [][] prepareAdapterSequences() {
final Set<String> kmers = new HashSet<String>();
// Make a set of all kmers of adapterMatchLength
for (final String seq : adapterSequence) {
for (int i=0; i<=seq.length() - ADAPTER_MATCH_LENGTH; ++i) {
final String kmer = seq.substring(i, i+ADAPTER_MATCH_LENGTH).toUpperCase();
int ns = 0;
for (final char ch : kmer.toCharArray()) if (ch == 'N') ++ns;
if (ns <= MAX_ADAPTER_ERRORS) {
kmers.add(kmer);
kmers.add(SequenceUtil.reverseComplement(kmer));
}
}
}
// Make an array of byte[] for the kmers
final byte [][] adapterKmers = new byte[kmers.size()][];
int i=0;
for (final String kmer : kmers) {
adapterKmers[i++] = StringUtil.stringToBytes(kmer);
}
return adapterKmers;
}
/**
* Checks the first ADAPTER_MATCH_LENGTH bases of the read against known adapter sequences and returns
* true if the read matches an adapter sequence with MAX_ADAPTER_ERRORS mismsatches or fewer.
*
* @param read the basecalls for the read in the order and orientation the machine read them
* @return true if the read matches an adapter and false otherwise
*/
private boolean isAdapterSequence(final byte[] read) {
if (read.length < ADAPTER_MATCH_LENGTH) return false;
for (final byte[] adapter : adapterKmers) {
int errors = 0;
for (int i=0; i<adapter.length; ++i) {
if (read[i] != adapter[i]) {
if (++errors > MAX_ADAPTER_ERRORS) break;
}
}
if (errors <= MAX_ADAPTER_ERRORS) return true;
}
return false;
}
private class GroupAlignmentSummaryMetricsPerUnitMetricCollector implements PerUnitMetricCollector<AlignmentSummaryMetrics, Comparable<?>, SAMRecordAndReference> {
final IndividualAlignmentSummaryMetricsCollector unpairedCollector;
final IndividualAlignmentSummaryMetricsCollector firstOfPairCollector;
final IndividualAlignmentSummaryMetricsCollector secondOfPairCollector;
final IndividualAlignmentSummaryMetricsCollector pairCollector;
final String sample;
final String library;
final String readGroup;
public GroupAlignmentSummaryMetricsPerUnitMetricCollector(final String sample, final String library, final String readGroup) {
this.sample = sample;
this.library = library;
this.readGroup = readGroup;
unpairedCollector = new IndividualAlignmentSummaryMetricsCollector(AlignmentSummaryMetrics.Category.UNPAIRED, sample, library, readGroup);
firstOfPairCollector = new IndividualAlignmentSummaryMetricsCollector(AlignmentSummaryMetrics.Category.FIRST_OF_PAIR, sample, library, readGroup);
secondOfPairCollector = new IndividualAlignmentSummaryMetricsCollector(AlignmentSummaryMetrics.Category.SECOND_OF_PAIR, sample, library, readGroup);
pairCollector = new IndividualAlignmentSummaryMetricsCollector(AlignmentSummaryMetrics.Category.PAIR, sample, library, readGroup);
}
public void acceptRecord(final SAMRecordAndReference args) {
final SAMRecord rec = args.getSamRecord();
final ReferenceSequence ref = args.getReferenceSequence();
if (rec.getReadPairedFlag()) {
if (rec.getFirstOfPairFlag()) {
firstOfPairCollector.addRecord(rec, ref);
}
else {
secondOfPairCollector.addRecord(rec, ref);
}
pairCollector.addRecord(rec, ref);
}
else {
unpairedCollector.addRecord(rec, ref);
}
}
@Override
public void finish() {
// Let the collectors do any summary computations etc.
unpairedCollector.onComplete();
firstOfPairCollector.onComplete();
secondOfPairCollector.onComplete();
pairCollector.onComplete();
}
@Override
public void addMetricsToFile(final MetricsFile<AlignmentSummaryMetrics, Comparable<?>> file) {
if (firstOfPairCollector.getMetrics().TOTAL_READS > 0) {
// override how bad cycle is determined for paired reads, it should be
// the sum of first and second reads
pairCollector.getMetrics().BAD_CYCLES = firstOfPairCollector.getMetrics().BAD_CYCLES +
secondOfPairCollector.getMetrics().BAD_CYCLES;
file.addMetric(firstOfPairCollector.getMetrics());
file.addMetric(secondOfPairCollector.getMetrics());
file.addMetric(pairCollector.getMetrics());
}
//if there are no reads in any category then we will returned an unpaired alignment summary metric with all zero values
if (unpairedCollector.getMetrics().TOTAL_READS > 0 || firstOfPairCollector.getMetrics().TOTAL_READS == 0) {
file.addMetric(unpairedCollector.getMetrics());
}
}
/**
* Class that counts reads that match various conditions
*/
private class IndividualAlignmentSummaryMetricsCollector {
private long numPositiveStrand = 0;
private final Histogram<Integer> readLengthHistogram = new Histogram<Integer>();
private AlignmentSummaryMetrics metrics;
private long chimeras;
private long chimerasDenominator;
private long adapterReads;
private long indels;
private long nonBisulfiteAlignedBases = 0;
private long hqNonBisulfiteAlignedBases = 0;
private final Histogram<Long> mismatchHistogram = new Histogram<Long>();
private final Histogram<Long> hqMismatchHistogram = new Histogram<Long>();
private final Histogram<Integer> badCycleHistogram = new Histogram<Integer>();
public IndividualAlignmentSummaryMetricsCollector(final AlignmentSummaryMetrics.Category pairingCategory,
final String sample,
final String library,
final String readGroup) {
metrics = new AlignmentSummaryMetrics();
metrics.CATEGORY = pairingCategory;
metrics.SAMPLE = sample;
metrics.LIBRARY = library;
metrics.READ_GROUP = readGroup;
}
public void addRecord(final SAMRecord record, final ReferenceSequence ref) {
if (record.isSecondaryOrSupplementary()) {
// only want 1 count per read so skip non primary alignments
return;
}
collectReadData(record, ref);
collectQualityData(record, ref);
}
public void onComplete() {
//summarize read data
if (metrics.TOTAL_READS > 0)
{
metrics.PCT_PF_READS = (double) metrics.PF_READS / (double) metrics.TOTAL_READS;
metrics.PCT_ADAPTER = this.adapterReads / (double) metrics.PF_READS;
metrics.MEAN_READ_LENGTH = readLengthHistogram.getMean();
//Calculate BAD_CYCLES
metrics.BAD_CYCLES = 0;
for (final Histogram<Integer>.Bin cycleBin : badCycleHistogram.values()) {
final double badCyclePercentage = cycleBin.getValue() / metrics.TOTAL_READS;
if (badCyclePercentage >= .8) {
metrics.BAD_CYCLES++;
}
}
if(doRefMetrics) {
if (metrics.PF_READS > 0) metrics.PCT_PF_READS_ALIGNED = (double) metrics.PF_READS_ALIGNED / (double) metrics.PF_READS;
if (metrics.PF_READS_ALIGNED > 0) metrics.PCT_READS_ALIGNED_IN_PAIRS = (double) metrics.READS_ALIGNED_IN_PAIRS/ (double) metrics.PF_READS_ALIGNED;
if (metrics.PF_READS_ALIGNED > 0) metrics.STRAND_BALANCE = numPositiveStrand / (double) metrics.PF_READS_ALIGNED;
if (this.chimerasDenominator > 0) metrics.PCT_CHIMERAS = this.chimeras / (double) this.chimerasDenominator;
if (nonBisulfiteAlignedBases > 0) metrics.PF_MISMATCH_RATE = mismatchHistogram.getSum() / (double) nonBisulfiteAlignedBases;
metrics.PF_HQ_MEDIAN_MISMATCHES = hqMismatchHistogram.getMedian();
if (hqNonBisulfiteAlignedBases > 0) metrics.PF_HQ_ERROR_RATE = hqMismatchHistogram.getSum() / (double) hqNonBisulfiteAlignedBases;
if (metrics.PF_ALIGNED_BASES > 0) metrics.PF_INDEL_RATE = this.indels / (double) metrics.PF_ALIGNED_BASES;
}
}
}
private void collectReadData(final SAMRecord record, final ReferenceSequence ref) {
metrics.TOTAL_READS++;
readLengthHistogram.increment(record.getReadBases().length);
if (!record.getReadFailsVendorQualityCheckFlag()) {
metrics.PF_READS++;
if (isNoiseRead(record)) metrics.PF_NOISE_READS++;
if (record.getReadUnmappedFlag()) {
// If the read is unmapped see if it's adapter sequence
final byte[] readBases = record.getReadBases();
if (!(record instanceof BAMRecord)) StringUtil.toUpperCase(readBases);
if (isAdapterSequence(readBases)) {
this.adapterReads++;
}
}
else if(doRefMetrics) {
metrics.PF_READS_ALIGNED++;
if (!record.getReadNegativeStrandFlag()) numPositiveStrand++;
if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
metrics.READS_ALIGNED_IN_PAIRS++;
// Check that both ends have mapq > minimum
final Integer mateMq = record.getIntegerAttribute("MQ");
if (mateMq == null || mateMq >= MAPPING_QUALITY_THRESOLD && record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD) {
++this.chimerasDenominator;
// With both reads mapped we can see if this pair is chimeric
if (Math.abs(record.getInferredInsertSize()) > maxInsertSize ||
!record.getReferenceIndex().equals(record.getMateReferenceIndex())) {
++this.chimeras;
}
}
}
}
}
}
private void collectQualityData(final SAMRecord record, final ReferenceSequence reference) {
// If the read isnt an aligned PF read then look at the read for no-calls
if (record.getReadUnmappedFlag() || record.getReadFailsVendorQualityCheckFlag() || !doRefMetrics) {
final byte[] readBases = record.getReadBases();
for (int i = 0; i < readBases.length; i++) {
if (SequenceUtil.isNoCall(readBases[i])) {
badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i));
}
}
}
else if (!record.getReadFailsVendorQualityCheckFlag()) {
final boolean highQualityMapping = isHighQualityMapping(record);
if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++;
final byte[] readBases = record.getReadBases();
final byte[] refBases = reference.getBases();
final byte[] qualities = record.getBaseQualities();
final int refLength = refBases.length;
long mismatchCount = 0;
long hqMismatchCount = 0;
for (final AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) {
final int readIndex = alignmentBlock.getReadStart() - 1;
final int refIndex = alignmentBlock.getReferenceStart() - 1;
final int length = alignmentBlock.getLength();
for (int i=0; i<length && refIndex+i<refLength; ++i) {
final int readBaseIndex = readIndex + i;
boolean mismatch = !SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex+i]);
boolean bisulfiteBase = false;
if (mismatch && isBisulfiteSequenced) {
if ( (record.getReadNegativeStrandFlag() &&
(refBases[refIndex+i] == 'G' || refBases[refIndex+i] =='g') &&
(readBases[readBaseIndex] == 'A' || readBases[readBaseIndex] == 'a'))
|| ((!record.getReadNegativeStrandFlag()) &&
(refBases[refIndex+i] == 'C' || refBases[refIndex+i] == 'c') &&
(readBases[readBaseIndex] == 'T') || readBases[readBaseIndex] == 't') ) {
bisulfiteBase = true;
mismatch = false;
}
}
if(mismatch) mismatchCount++;
metrics.PF_ALIGNED_BASES++;
if(!bisulfiteBase) nonBisulfiteAlignedBases++;
if (highQualityMapping) {
metrics.PF_HQ_ALIGNED_BASES++;
if (!bisulfiteBase) hqNonBisulfiteAlignedBases++;
if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD) metrics.PF_HQ_ALIGNED_Q20_BASES++;
if (mismatch) hqMismatchCount++;
}
if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) {
badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i));
}
}
}
mismatchHistogram.increment(mismatchCount);
hqMismatchHistogram.increment(hqMismatchCount);
// Add any insertions and/or deletions to the global count
for (final CigarElement elem : record.getCigar().getCigarElements()) {
final CigarOperator op = elem.getOperator();
if (op == CigarOperator.INSERTION || op == CigarOperator.DELETION) ++ this.indels;
}
}
}
private boolean isNoiseRead(final SAMRecord record) {
final Object noiseAttribute = record.getAttribute(ReservedTagConstants.XN);
return (noiseAttribute != null && noiseAttribute.equals(1));
}
private boolean isHighQualityMapping(final SAMRecord record) {
return !record.getReadFailsVendorQualityCheckFlag() &&
record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD;
}
public AlignmentSummaryMetrics getMetrics() {
return this.metrics;
}
}
}
}