/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMProgramRecord;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SecondaryOrSupplementarySkippingIterator;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.PositionalArguments;
import picard.cmdline.programgroups.SamOrBam;
import java.io.File;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Rudimentary SAM comparer. Compares headers, and if headers are compatible enough, compares SAMRecords,
* looking only at basic alignment info. Summarizes the number of alignments that match, mismatch, are missing, etc.
* @author alecw@broadinstitute.org
*/
@CommandLineProgramProperties(
usage = "USAGE: CompareSAMS <SAMFile1> <SAMFile2>\n" +
"Compares the headers of the two input SAM or BAM files, and, if possible, the SAMRecords. " +
"For SAMRecords, compares only the readUnmapped flag, reference name, start position and strand. " +
"Reports the number of SAMRecords that match, differ in alignment, are mapped in only one input, " +
"or are missing in one of the files",
usageShort = "Compares two input SAM or BAM files",
programGroup = SamOrBam.class
)
public class CompareSAMs extends CommandLineProgram {
@PositionalArguments(minElements = 2, maxElements = 2)
public List<File> samFiles;
private final SAMFileReader[] samReaders = new SAMFileReader[2];
private boolean sequenceDictionariesDiffer;
private int mappingsMatch = 0;
private int unmappedBoth = 0;
private int unmappedLeft = 0;
private int unmappedRight = 0;
private int mappingsDiffer = 0;
private int missingLeft = 0;
private int missingRight = 0;
private boolean areEqual;
public static void main(String[] argv) {
new CompareSAMs().instanceMainWithExit(argv);
}
/**
* Do the work after command line has been parsed. RuntimeException may be
* thrown by this method, and are reported appropriately.
*
* @return program exit status.
*/
@Override
protected int doWork() {
for (int i = 0; i < samFiles.size(); ++i) {
samReaders[i] = new SAMFileReader(samFiles.get(i));
}
areEqual = compareHeaders();
areEqual = compareAlignments() && areEqual;
printReport();
if (!areEqual) {
System.out.println("SAM files differ.");
} else {
System.out.println("SAM files match.");
}
return 0;
}
private void printReport() {
System.out.println("Match\t" + mappingsMatch);
System.out.println("Differ\t" + mappingsDiffer);
System.out.println("Unmapped_both\t" + unmappedBoth);
System.out.println("Unmapped_left\t" + unmappedLeft);
System.out.println("Unmapped_right\t" + unmappedRight);
System.out.println("Missing_left\t" + missingLeft);
System.out.println("Missing_right\t" + missingRight);
}
private boolean compareAlignments() {
if (!compareValues(samReaders[0].getFileHeader().getSortOrder(), samReaders[1].getFileHeader().getSortOrder(),
"Sort Order")) {
System.out.println("Cannot compare alignments if sort orders differ.");
return false;
}
switch (samReaders[0].getFileHeader().getSortOrder()) {
case coordinate:
if (sequenceDictionariesDiffer) {
System.out.println("Cannot compare coordinate-sorted SAM files because sequence dictionaries differ.");
return false;
}
return compareCoordinateSortedAlignments();
case queryname:
return compareQueryNameSortedAlignments();
case unsorted:
return compareUnsortedAlignments();
default:
// unreachable
return false;
}
}
private boolean compareCoordinateSortedAlignments() {
final SecondaryOrSupplementarySkippingIterator itLeft =
new SecondaryOrSupplementarySkippingIterator(samReaders[0].iterator());
final SecondaryOrSupplementarySkippingIterator itRight =
new SecondaryOrSupplementarySkippingIterator(samReaders[1].iterator());
// Save any reads which haven't been matched during in-order scan.
final Map<String, SAMRecord> leftUnmatched = new HashMap<String, SAMRecord>();
final Map<String, SAMRecord> rightUnmatched = new HashMap<String, SAMRecord>();
boolean ret = true;
while (itLeft.hasCurrent()) {
if (!itRight.hasCurrent()) {
// Exhausted right side. See if any of the remaining left reads match
// any of the saved right reads.
for( ; itLeft.hasCurrent(); itLeft.advance()) {
final SAMRecord left = itLeft.getCurrent();
final SAMRecord right = rightUnmatched.remove(left.getReadName());
if (right == null) {
++missingRight;
} else {
tallyAlignmentRecords(left, right);
}
}
break;
}
// Don't assume stability of order beyond the coordinate. Therefore grab all the
// reads from the left that has the same coordinate.
final SAMRecord left = itLeft.getCurrent();
final Map<String, SAMRecord> leftCurrentCoordinate = new HashMap<String, SAMRecord>();
leftCurrentCoordinate.put(left.getReadName(), left);
while (itLeft.advance()) {
final SAMRecord nextLeft = itLeft.getCurrent();
if (compareAlignmentCoordinates(left, nextLeft) == 0) {
leftCurrentCoordinate.put(nextLeft.getReadName(), nextLeft);
} else {
break;
}
}
// Advance the right iterator until it is >= the left reads that have just been grabbed
while (itRight.hasCurrent() && compareAlignmentCoordinates(left, itRight.getCurrent()) > 0) {
final SAMRecord right = itRight.getCurrent();
rightUnmatched.put(right.getReadName(), right);
itRight.advance();
}
// For each right read that has the same coordinate as the current left reads,
// see if there is a matching left read. If so, process and discard. If not,
// save the right read for later.
for (;itRight.hasCurrent() && compareAlignmentCoordinates(left, itRight.getCurrent()) == 0; itRight.advance()) {
final SAMRecord right = itRight.getCurrent();
final SAMRecord matchingLeft = leftCurrentCoordinate.remove(right.getReadName());
if (matchingLeft != null) {
ret = tallyAlignmentRecords(matchingLeft, right) && ret;
} else {
rightUnmatched.put(right.getReadName(), right);
}
}
// Anything left in leftCurrentCoordinate has not been matched
for (final SAMRecord samRecord : leftCurrentCoordinate.values()) {
leftUnmatched.put(samRecord.getReadName(), samRecord);
}
}
// The left iterator has been exhausted. See if any of the remaining right reads
// match any of the saved left reads.
for( ; itRight.hasCurrent(); itRight.advance()) {
final SAMRecord right = itRight.getCurrent();
final SAMRecord left = leftUnmatched.remove(right.getReadName());
if (left != null) {
tallyAlignmentRecords(left, right);
} else {
++missingLeft;
}
}
// Look up reads that were unmatched from left, and see if they are in rightUnmatched.
// If found, remove from rightUnmatched and tally.
for (final Map.Entry<String, SAMRecord> leftEntry : leftUnmatched.entrySet()) {
final String readName = leftEntry.getKey();
final SAMRecord left = leftEntry.getValue();
final SAMRecord right = rightUnmatched.remove(readName);
if (right == null) {
++missingRight;
continue;
}
tallyAlignmentRecords(left, right);
}
// Any elements remaining in rightUnmatched are guaranteed not to be in leftUnmatched.
missingLeft += rightUnmatched.size();
if (ret) {
if (missingLeft > 0 || missingRight > 0 || mappingsDiffer > 0 || unmappedLeft > 0 || unmappedRight > 0) {
ret = false;
}
}
return ret;
}
private int compareAlignmentCoordinates(final SAMRecord left, final SAMRecord right) {
final String leftReferenceName = left.getReferenceName();
final String rightReferenceName = right.getReferenceName();
if (leftReferenceName == null && rightReferenceName == null) {
return 0;
} else if (leftReferenceName == null) {
return 1;
} else if (rightReferenceName == null) {
return -1;
}
final int leftReferenceIndex = samReaders[0].getFileHeader().getSequenceIndex(leftReferenceName);
final int rightReferenceIndex = samReaders[0].getFileHeader().getSequenceIndex(rightReferenceName);
if (leftReferenceIndex != rightReferenceIndex) {
return leftReferenceIndex - rightReferenceIndex;
}
return left.getAlignmentStart() - right.getAlignmentStart();
}
private boolean compareQueryNameSortedAlignments() {
final SecondaryOrSupplementarySkippingIterator it1 = new SecondaryOrSupplementarySkippingIterator(samReaders[0].iterator());
final SecondaryOrSupplementarySkippingIterator it2 = new SecondaryOrSupplementarySkippingIterator(samReaders[1].iterator());
boolean ret = true;
while (it1.hasCurrent()) {
if (!it2.hasCurrent()) {
missingRight += countRemaining(it1);
return false;
}
final int cmp = it1.getCurrent().getReadName().compareTo(it2.getCurrent().getReadName());
if (cmp < 0) {
++missingRight;
it1.advance();
ret = false;
} else if (cmp > 0) {
++missingLeft;
it2.advance();
ret = false;
} else {
if (!tallyAlignmentRecords(it1.getCurrent(), it2.getCurrent())) {
ret = false;
}
it1.advance();
it2.advance();
}
}
if (it2.hasCurrent()) {
missingLeft += countRemaining(it2);
return false;
}
return ret;
}
private boolean compareUnsortedAlignments() {
final SecondaryOrSupplementarySkippingIterator it1 = new SecondaryOrSupplementarySkippingIterator(samReaders[0].iterator());
final SecondaryOrSupplementarySkippingIterator it2 = new SecondaryOrSupplementarySkippingIterator(samReaders[1].iterator());
boolean ret = true;
for (; it1.hasCurrent(); it1.advance(), it2.advance()) {
if (!it2.hasCurrent()) {
missingRight += countRemaining(it1);
return false;
}
final SAMRecord s1 = it1.getCurrent();
final SAMRecord s2 = it2.getCurrent();
if (!compareValues(s1.getReadName(), s2.getReadName(), "Read names")) {
System.out.println("Read names cease agreeing in unsorted SAM files . Comparison aborting.");
}
ret = tallyAlignmentRecords(s1, s2) && ret;
}
if (it2.hasCurrent()) {
missingLeft += countRemaining(it2);
return false;
}
return ret;
}
private int countRemaining(final SecondaryOrSupplementarySkippingIterator it) {
int i;
for (i = 0; it.hasCurrent(); ++i) {
it.advance();
}
return i;
}
private boolean tallyAlignmentRecords(final SAMRecord s1, final SAMRecord s2) {
if (!s1.getReadName().equals(s2.getReadName())) {
throw new PicardException("Read names do not match: " + s1.getReadName() + " : " + s2.getReadName());
}
if (s1.getReadUnmappedFlag() && s2.getReadUnmappedFlag()) {
++unmappedBoth;
return true;
}
if (s1.getReadUnmappedFlag()) {
++unmappedLeft;
return false;
}
if (s2.getReadUnmappedFlag()) {
++unmappedRight;
return false;
}
final boolean ret = (s1.getReferenceName().equals(s2.getReferenceName()) &&
s1.getAlignmentStart() == s2.getAlignmentStart() &&
s1.getReadNegativeStrandFlag() == s1.getReadNegativeStrandFlag());
if (!ret) {
++mappingsDiffer;
} else {
++mappingsMatch;
}
return ret;
}
private boolean compareHeaders() {
final SAMFileHeader h1 = samReaders[0].getFileHeader();
final SAMFileHeader h2 = samReaders[1].getFileHeader();
boolean ret = compareValues(h1.getVersion(), h2.getVersion(), "File format version");
ret = compareValues(h1.getCreator(), h2.getCreator(), "File creator") && ret;
ret = compareValues(h1.getAttribute("SO"), h2.getAttribute("SO"), "Sort order") && ret;
if (!compareSequenceDictionaries(h1, h2)) {
ret = false;
sequenceDictionariesDiffer = true;
}
ret = compareReadGroups(h1, h2) && ret;
ret = compareProgramRecords(h1, h2) && ret;
return ret;
}
private boolean compareProgramRecords(final SAMFileHeader h1, final SAMFileHeader h2) {
final List<SAMProgramRecord> l1 = h1.getProgramRecords();
final List<SAMProgramRecord> l2 = h2.getProgramRecords();
if (!compareValues(l1.size(), l2.size(), "Number of program records")) {
return false;
}
boolean ret = true;
for (int i = 0; i < l1.size(); ++i) {
ret = compareProgramRecord(l1.get(i), l2.get(i)) && ret;
}
return ret;
}
private boolean compareProgramRecord(final SAMProgramRecord programRecord1, final SAMProgramRecord programRecord2) {
if (programRecord1 == null && programRecord2 == null) {
return true;
}
if (programRecord1 == null) {
reportDifference("null", programRecord2.getProgramGroupId(), "Program Record");
return false;
}
if (programRecord2 == null) {
reportDifference(programRecord1.getProgramGroupId(), "null", "Program Record");
return false;
}
boolean ret = compareValues(programRecord1.getProgramGroupId(), programRecord2.getProgramGroupId(),
"Program Name");
final String[] attributes = {"VN", "CL"};
for (final String attribute: attributes) {
ret = compareValues(programRecord1.getAttribute(attribute), programRecord2.getAttribute(attribute),
attribute + " Program Record attribute") && ret;
}
return ret;
}
private boolean compareReadGroups(final SAMFileHeader h1, final SAMFileHeader h2) {
final List<SAMReadGroupRecord> l1 = h1.getReadGroups();
final List<SAMReadGroupRecord> l2 = h2.getReadGroups();
if (!compareValues(l1.size(), l2.size(), "Number of read groups")) {
return false;
}
boolean ret = true;
for (int i = 0; i < l1.size(); ++i) {
ret = compareReadGroup(l1.get(i), l2.get(i)) && ret;
}
return ret;
}
private boolean compareReadGroup(final SAMReadGroupRecord samReadGroupRecord1, final SAMReadGroupRecord samReadGroupRecord2) {
boolean ret = compareValues(samReadGroupRecord1.getReadGroupId(), samReadGroupRecord2.getReadGroupId(),
"Read Group ID");
ret = compareValues(samReadGroupRecord1.getSample(), samReadGroupRecord2.getSample(),
"Sample for read group " + samReadGroupRecord1.getReadGroupId()) && ret;
ret = compareValues(samReadGroupRecord1.getLibrary(), samReadGroupRecord2.getLibrary(),
"Library for read group " + samReadGroupRecord1.getReadGroupId()) && ret;
final String[] attributes = {"DS", "PU", "PI", "CN", "DT", "PL"};
for (final String attribute : attributes) {
ret = compareValues(samReadGroupRecord1.getAttribute(attribute), samReadGroupRecord2.getAttribute(attribute),
attribute + " for read group " + samReadGroupRecord1.getReadGroupId()) && ret;
}
return ret;
}
private boolean compareSequenceDictionaries(final SAMFileHeader h1, final SAMFileHeader h2) {
final List<SAMSequenceRecord> s1 = h1.getSequenceDictionary().getSequences();
final List<SAMSequenceRecord> s2 = h2.getSequenceDictionary().getSequences();
if (s1.size() != s2.size()) {
reportDifference(s1.size(), s2.size(), "Length of sequence dictionaries");
return false;
}
boolean ret = true;
for (int i = 0; i < s1.size(); ++i) {
ret = compareSequenceRecord(s1.get(i), s2.get(i), i+1) && ret;
}
return ret;
}
private boolean compareSequenceRecord(final SAMSequenceRecord sequenceRecord1, final SAMSequenceRecord sequenceRecord2, final int which) {
if (!sequenceRecord1.getSequenceName().equals(sequenceRecord2.getSequenceName())) {
reportDifference(sequenceRecord1.getSequenceName(), sequenceRecord2.getSequenceName(),
"Name of sequence record " + which);
return false;
}
boolean ret = compareValues(sequenceRecord1.getSequenceLength(), sequenceRecord2.getSequenceLength(), "Length of sequence " +
sequenceRecord1.getSequenceName());
ret = compareValues(sequenceRecord1.getSpecies(), sequenceRecord2.getSpecies(), "Species of sequence " +
sequenceRecord1.getSequenceName()) && ret;
ret = compareValues(sequenceRecord1.getAssembly(), sequenceRecord2.getAssembly(), "Assembly of sequence " +
sequenceRecord1.getSequenceName()) && ret;
ret = compareValues(sequenceRecord1.getAttribute("M5"), sequenceRecord2.getAttribute("M5"), "MD5 of sequence " +
sequenceRecord1.getSequenceName()) && ret;
ret = compareValues(sequenceRecord1.getAttribute("UR"), sequenceRecord2.getAttribute("UR"), "URI of sequence " +
sequenceRecord1.getSequenceName()) && ret;
return ret;
}
private <T> boolean compareValues(final T v1, final T v2, final String label) {
if (v1 == null) {
if (v2 == null) {
return true;
}
reportDifference(v1, v2, label);
return false;
}
if (v2 == null) {
reportDifference(v1, v2, label);
return false;
}
if (!v1.equals(v2)) {
reportDifference(v1, v2, label);
return false;
}
return true;
}
private void reportDifference(final String s1, final String s2, final String label) {
System.out.println(label + " differs.");
System.out.println(samFiles.get(0) + ": " + s1);
System.out.println(samFiles.get(1) + ": " + s2);
}
private void reportDifference(Object o1, Object o2, final String label) {
if (o1 == null) {
o1 = "null";
}
if (o2 == null) {
o2 = "null";
}
reportDifference(o1.toString(), o2.toString(), label);
}
public int getMappingsMatch() {
return mappingsMatch;
}
public int getUnmappedBoth() {
return unmappedBoth;
}
public int getUnmappedLeft() {
return unmappedLeft;
}
public int getUnmappedRight() {
return unmappedRight;
}
public int getMappingsDiffer() {
return mappingsDiffer;
}
public int getMissingLeft() {
return missingLeft;
}
public int getMissingRight() {
return missingRight;
}
public boolean areEqual() {
return areEqual;
}
}