/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.Metrics;
import picard.sam.DuplicationMetrics;
import picard.sam.markduplicates.util.AbstractOpticalDuplicateFinderCommandLineProgram;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

import java.io.*;
import java.util.*;

import static java.lang.Math.pow;
/**
* <p>Attempts to estimate library complexity from sequence alone. Does so by sorting all reads
* by the first N bases (5 by default) of each read and then comparing reads with the first
* N bases identical to each other for duplicates. Reads are considered to be duplicates if
* they match each other with no gaps and an overall mismatch rate less than or equal to
* MAX_DIFF_RATE (0.03 by default).</p>
 *
* <p>Reads of poor quality are filtered out so as to provide a more accurate estimate. The filtering
* removes reads with any no-calls in the first N bases or with a mean base quality lower than
* MIN_MEAN_QUALITY across either the first or second read.</p>
 *
 * <p>The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes
 * these in the calculation of library size. Also, since there is no alignment with which to screen out
 * technical reads, one further filter is applied to the data. After examining all reads, a Histogram is
 * built of [#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate
 * set are then removed from the Histogram as outliers before library size is estimated.</p>
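 *
 * <p>Example invocation (illustrative; file names are placeholders):</p>
 * <pre>
 *     java -jar picard.jar EstimateLibraryComplexity \
 *          INPUT=input.bam \
 *          OUTPUT=complexity_metrics.txt
 * </pre>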
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = "Attempts to estimate library complexity from sequence of read pairs alone. Does so by sorting all reads " +
"by the first N bases (5 by default) of each read and then comparing reads with the first " +
"N bases identical to each other for duplicates. Reads are considered to be duplicates if " +
"they match each other with no gaps and an overall mismatch rate less than or equal to " +
"MAX_DIFF_RATE (0.03 by default).\n\n" +
"Reads of poor quality are filtered out so as to provide a more accurate estimate. The filtering " +
"removes reads with any no-calls in the first N bases or with a mean base quality lower than " +
"MIN_MEAN_QUALITY across either the first or second read.\n\n" +
"Unpaired reads are ignored in this computation.\n\n" +
"The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes " +
"these in the calculation of library size. Also, since there is no alignment to screen out technical " +
"reads one further filter is applied on the data. After examining all reads a Histogram is built of " +
"[#reads in duplicate set -> #of duplicate sets] all bins that contain exactly one duplicate set are " +
"then removed from the Histogram as outliers before library size is estimated.",
usageShort = "Estimates library complexity from the sequence of read pairs",
programGroup = Metrics.class
)
public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCommandLineProgram {
@Option(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="One or more files to combine and " +
"estimate library complexity from. Reads can be mapped or unmapped.")
public List<File> INPUT;
@Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
doc = "Output file to writes per-library metrics to.")
public File OUTPUT;
@Option(doc = "The minimum number of bases at the starts of reads that must be identical for reads to " +
"be grouped together for duplicate detection. In effect total_reads / 4^max_id_bases reads will " +
"be compared at a time, so lower numbers will produce more accurate results but consume " +
"exponentially more memory and CPU.")
public int MIN_IDENTICAL_BASES = 5;
@Option(doc = "The maximum rate of differences between two reads to call them identical.")
public double MAX_DIFF_RATE = 0.03;
@Option(doc = "The minimum mean quality of the bases in a read pair for the read to be analyzed. Reads with " +
"lower average quality are filtered out and not considered in any calculations.")
public int MIN_MEAN_QUALITY = 20;
@Option(doc = "Do not process self-similar groups that are this many times over the mean expected group size. " +
"I.e. if the input contains 10m read pairs and MIN_IDENTICAL_BASES is set to 5, then the mean expected " +
"group size would be approximately 10 reads.")
public int MAX_GROUP_RATIO = 500;
private final Log log = Log.getInstance(EstimateLibraryComplexity.class);
/**
* Little class to hold the sequence of a pair of reads and tile location information.
*/
static class PairedReadSequence implements OpticalDuplicateFinder.PhysicalLocation {
static int size_in_bytes = 2 + 1 + 4 + 1 + 300; // rough guess at memory footprint
short readGroup = -1;
short tile = -1;
short x = -1, y = -1;
boolean qualityOk = true;
byte[] read1;
byte[] read2;
short libraryId;
public short getReadGroup() { return this.readGroup; }
public void setReadGroup(final short readGroup) { this.readGroup = readGroup; }
public short getTile() { return this.tile; }
public void setTile(final short tile) { this.tile = tile; }
public short getX() { return this.x; }
public void setX(final short x) { this.x = x; }
public short getY() { return this.y; }
public void setY(final short y) { this.y = y; }
public short getLibraryId() { return this.libraryId; }
public void setLibraryId(final short libraryId) { this.libraryId = libraryId; }
}
/**
     * Codec class for writing and reading PairedReadSequence objects.
*/
static class PairedReadCodec implements SortingCollection.Codec<PairedReadSequence> {
private DataOutputStream out;
private DataInputStream in;
public void setOutputStream(final OutputStream out) {
this.out = new DataOutputStream(out);
}
public void setInputStream(final InputStream in) {
this.in = new DataInputStream(in);
}
public void encode(final PairedReadSequence val) {
try {
this.out.writeShort(val.readGroup);
this.out.writeShort(val.tile);
this.out.writeShort(val.x);
this.out.writeShort(val.y);
this.out.writeInt(val.read1.length);
this.out.write(val.read1);
this.out.writeInt(val.read2.length);
this.out.write(val.read2);
} catch (IOException ioe) {
throw new PicardException("Error write out read pair.", ioe);
}
}
public PairedReadSequence decode() {
try {
final PairedReadSequence val = new PairedReadSequence();
try {
val.readGroup = this.in.readShort();
} catch (EOFException eof) {
return null;
}
val.tile = this.in.readShort();
val.x = this.in.readShort();
val.y = this.in.readShort();
                int length = this.in.readInt();
                val.read1 = new byte[length];
                // readFully is required here: a single read() call may legally return fewer
                // bytes than requested even when more data remains in the stream.
                this.in.readFully(val.read1);
                length = this.in.readInt();
                val.read2 = new byte[length];
                this.in.readFully(val.read2);
return val;
} catch (IOException ioe) {
throw new PicardException("Exception reading read pair.", ioe);
}
}
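        /** SortingCollection uses cloned codecs so that each temp-file stream gets an independent instance. */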
@Override
public SortingCollection.Codec<PairedReadSequence> clone() { return new PairedReadCodec(); }
}
/**
* Comparator that orders read pairs on the first N bases of both reads.
*/
class PairedReadComparator implements Comparator<PairedReadSequence> {
final int BASES = EstimateLibraryComplexity.this.MIN_IDENTICAL_BASES;
public int compare(final PairedReadSequence lhs, final PairedReadSequence rhs) {
// First compare the first N bases of the first read
for (int i = 0; i < BASES; ++i) {
final int retval = lhs.read1[i] - rhs.read1[i];
if (retval != 0) return retval;
}
// Then compare the first N bases of the second read
for (int i = 0; i < BASES; ++i) {
final int retval = lhs.read2[i] - rhs.read2[i];
if (retval != 0) return retval;
}
            // Arbitrary but run-stable tie-break; Integer.compare avoids the overflow that
            // subtracting raw hash codes could produce.
            return Integer.compare(System.identityHashCode(lhs), System.identityHashCode(rhs));
}
}
/** Stock main method. */
public static void main(final String[] args) {
new EstimateLibraryComplexity().instanceMainWithExit(args);
}
public EstimateLibraryComplexity() {
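        // Size the spill-to-disk threshold so that buffered PairedReadSequence records consume
        // at most roughly half of the available heap (size_in_bytes is only a rough estimate).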
MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / PairedReadSequence.size_in_bytes) / 2;
}
/**
* Method that does most of the work. Reads through the input BAM file and extracts the
* read sequences of each read pair and sorts them via a SortingCollection. Then traverses
* the sorted reads and looks at small groups at a time to find duplicates.
*/
@Override
protected int doWork() {
for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
log.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
int recordsRead = 0;
final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class,
new PairedReadCodec(),
new PairedReadComparator(),
MAX_RECORDS_IN_RAM,
TMP_DIR);
// Loop through the input files and pick out the read sequences etc.
final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read");
for (final File f : INPUT) {
final Map<String, PairedReadSequence> pendingByName = new HashMap<String, PairedReadSequence>();
final SAMFileReader in = new SAMFileReader(f);
readGroups.addAll(in.getFileHeader().getReadGroups());
for (final SAMRecord rec : in) {
if (!rec.getReadPairedFlag()) continue;
if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
continue;
}
                // Skip secondary and supplementary alignments so that each read name is seen at most
                // twice (assumes SAMRecord.isSecondaryOrSupplementary() exists in this htsjdk version).
                if (rec.isSecondaryOrSupplementary()) continue;
PairedReadSequence prs = pendingByName.remove(rec.getReadName());
if (prs == null) {
// Make a new paired read object and add RG and physical location information to it
prs = new PairedReadSequence();
if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
final SAMReadGroupRecord rg = rec.getReadGroup();
if (rg != null) prs.setReadGroup((short) readGroups.indexOf(rg));
}
pendingByName.put(rec.getReadName(), prs);
}
// Read passes quality check if both ends meet the mean quality criteria
final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(),
rec.getBaseQualities(),
MIN_IDENTICAL_BASES,
MIN_MEAN_QUALITY);
prs.qualityOk = prs.qualityOk && passesQualityCheck;
// Get the bases and restore them to their original orientation if necessary
final byte[] bases = rec.getReadBases();
if (rec.getReadNegativeStrandFlag()) SequenceUtil.reverseComplement(bases);
if (rec.getFirstOfPairFlag()) {
prs.read1 = bases;
} else {
prs.read2 = bases;
}
if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
sorter.add(prs);
}
                ++recordsRead; // count paired records so that meanGroupSize below reflects the actual input size
                progress.record(rec);
}
            CloserUtil.close(in);
}
log.info("Finished reading - moving on to scanning for duplicates.");
// Now go through the sorted reads and attempt to find duplicates
final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<PairedReadSequence>(sorter.iterator());
final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<String, Histogram<Integer>>();
final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<String, Histogram<Integer>>();
int groupsProcessed = 0;
long lastLogTime = System.currentTimeMillis();
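        // Each group is keyed on the first MIN_IDENTICAL_BASES bases of both ends of the pair, so the
        // expected group size is (total read pairs) / 4^(2 * MIN_IDENTICAL_BASES). E.g. 10 million
        // pairs with the default of 5 gives roughly 10 pairs per group.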
final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4, MIN_IDENTICAL_BASES * 2));
while (iterator.hasNext()) {
// Get the next group and split it apart by library
final List<PairedReadSequence> group = getNextGroup(iterator);
if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
final PairedReadSequence prs = group.get(0);
log.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " +
"Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " +
StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) +
" / " +
                        StringUtil.bytesToString(prs.read2, 0, MIN_IDENTICAL_BASES));
} else {
final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups);
// Now process the reads by library
for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) {
final String library = entry.getKey();
final List<PairedReadSequence> seqs = entry.getValue();
Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
if (duplicationHisto == null) {
duplicationHisto = new Histogram<Integer>("duplication_group_count", library);
opticalHisto = new Histogram<Integer>("duplication_group_count", "optical_duplicates");
duplicationHistosByLibrary.put(library, duplicationHisto);
opticalHistosByLibrary.put(library, opticalHisto);
}
// Figure out if any reads within this group are duplicates of one another
for (int i = 0; i < seqs.size(); ++i) {
final PairedReadSequence lhs = seqs.get(i);
if (lhs == null) continue;
final List<PairedReadSequence> dupes = new ArrayList<PairedReadSequence>();
for (int j = i + 1; j < seqs.size(); ++j) {
final PairedReadSequence rhs = seqs.get(j);
if (rhs == null) continue;
if (matches(lhs, rhs, MAX_DIFF_RATE)) {
dupes.add(rhs);
seqs.set(j, null);
}
}
if (dupes.size() > 0) {
dupes.add(lhs);
final int duplicateCount = dupes.size();
duplicationHisto.increment(duplicateCount);
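                            // findOpticalDuplicates flags each member of the set that is an optical duplicate
                            // of another member; each flagged read bumps the optical histogram at this set size.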
final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes);
for (final boolean b : flags) {
if (b) opticalHisto.increment(duplicateCount);
}
} else {
duplicationHisto.increment(1);
}
}
}
++groupsProcessed;
if (lastLogTime < System.currentTimeMillis() - 60000) {
log.info("Processed " + groupsProcessed + " groups.");
lastLogTime = System.currentTimeMillis();
}
}
}
iterator.close();
sorter.cleanup();
final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile();
for (final String library : duplicationHistosByLibrary.keySet()) {
final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library);
final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library);
final DuplicationMetrics metrics = new DuplicationMetrics();
metrics.LIBRARY = library;
            // Filter out any bins that have only a single entry in them and calculate the metrics.
            // A bin at key 'bin' counts the duplicate sets containing exactly 'bin' read pairs;
            // each such set contributes 'bin' pairs examined, of which 'bin - 1' are duplicates.
for (final Integer bin : duplicationHisto.keySet()) {
final double duplicateGroups = duplicationHisto.get(bin).getValue();
final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();
if (duplicateGroups > 1) {
metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
}
}
metrics.calculateDerivedMetrics();
file.addMetric(metrics);
file.addHistogram(duplicationHisto);
}
file.write(OUTPUT);
return 0;
}
/**
     * Checks to see if two read pairs have sequences that are the same, give or take a few
     * errors/diffs as dictated by the maxDiffRate.
*/
private boolean matches(final PairedReadSequence lhs, final PairedReadSequence rhs, final double maxDiffRate) {
final int read1Length = Math.min(lhs.read1.length, rhs.read1.length);
final int read2Length = Math.min(lhs.read2.length, rhs.read2.length);
final int maxErrors = (int) Math.floor((read1Length + read2Length) * maxDiffRate);
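        // E.g. a pair of 100bp reads at the default MAX_DIFF_RATE of 0.03 tolerates
        // floor(200 * 0.03) = 6 mismatches in total across both reads.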
int errors = 0;
// The loop can start from MIN_IDENTICAL_BASES because we've already confirmed that
// at least those first few bases are identical when sorting.
for (int i = MIN_IDENTICAL_BASES; i < read1Length; ++i) {
if (lhs.read1[i] != rhs.read1[i]) {
if (++errors > maxErrors) return false;
}
}
for (int i = MIN_IDENTICAL_BASES; i < read2Length; ++i) {
if (lhs.read2[i] != rhs.read2[i]) {
if (++errors > maxErrors) return false;
}
}
return true;
}
/**
* Pulls out of the iterator the next group of reads that can be compared to each other to
* identify duplicates.
*/
List<PairedReadSequence> getNextGroup(final PeekableIterator<PairedReadSequence> iterator) {
final List<PairedReadSequence> group = new ArrayList<PairedReadSequence>();
final PairedReadSequence first = iterator.next();
group.add(first);
outer:
while (iterator.hasNext()) {
final PairedReadSequence next = iterator.peek();
for (int i = 0; i < MIN_IDENTICAL_BASES; ++i) {
if (first.read1[i] != next.read1[i] || first.read2[i] != next.read2[i]) break outer;
}
group.add(iterator.next());
}
return group;
}
/**
* Takes a list of PairedReadSequence objects and splits them into lists by library.
*/
Map<String, List<PairedReadSequence>> splitByLibrary(final List<PairedReadSequence> input,
final List<SAMReadGroupRecord> rgs) {
final Map<String, List<PairedReadSequence>> out = new HashMap<String, List<PairedReadSequence>>();
for (final PairedReadSequence seq : input) {
String library = null;
if (seq.getReadGroup() != -1) {
library = rgs.get(seq.getReadGroup()).getLibrary();
if (library == null) library = "Unknown";
} else {
library = "Unknown";
}
List<PairedReadSequence> librarySeqs = out.get(library);
if (librarySeqs == null) {
librarySeqs = new ArrayList<PairedReadSequence>();
out.put(library, librarySeqs);
}
librarySeqs.add(seq);
}
return out;
}
/**
* Checks that the average quality over the entire read is >= min, and that the first N bases do
* not contain any no-calls.
*/
boolean passesQualityCheck(final byte[] bases, final byte[] quals, final int seedLength, final int minQuality) {
if (bases.length < seedLength) return false;
for (int i = 0; i < seedLength; ++i) {
if (SequenceUtil.isNoCall(bases[i])) return false;
}
int total = 0;
for (final byte b : quals) total += b;
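        // Integer division truncates, so the mean quality is floored before the comparison.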
return total / quals.length >= minQuality;
}
}