/*
* The MIT License
*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
/**
* Reorders a SAM/BAM input file according to the order of contigs in a second reference sequence
*
* @author mdepristo
*/
@CommandLineProgramProperties(
usage = "Not to be confused with SortSam which sorts a SAM or BAM file with a valid sequence dictionary, " +
"ReorderSam reorders reads in a SAM/BAM file to match the contig ordering in a provided reference file, " +
"as determined by exact name matching of contigs. Reads mapped to contigs absent in the new " +
"reference are dropped. Runs substantially faster if the input is an indexed BAM file.",
usageShort = "Reorders reads in a SAM or BAM file to match ordering in reference",
programGroup = SamOrBam.class
)
public class ReorderSam extends CommandLineProgram {
@Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc="Input file (bam or sam) to extract reads from.")
public File INPUT;
@Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="Output file (bam or sam) to write extracted reads to.")
public File OUTPUT;
@Option(shortName=StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc="Reference sequence to reorder reads to match. " +
"A sequence dictionary corresponding to the reference fasta is required. Create one with CreateSequenceDictionary.jar.")
public File REFERENCE;
@Option(shortName="S", doc="If true, then allows only a partial overlap of the BAM contigs with the new reference " +
"sequence contigs. By default, this tool requires a corresponding contig in the new " +
"reference for each read contig")
public boolean ALLOW_INCOMPLETE_DICT_CONCORDANCE = false;
@Option(shortName="U", doc="If true, then permits mapping from a read contig to a new reference contig with the " +
"same name but a different length. Highly dangerous, only use if you know what you " +
"are doing.")
public boolean ALLOW_CONTIG_LENGTH_DISCORDANCE = false;
private final Log log = Log.getInstance(ReorderSam.class);
/** Required main method implementation. */
public static void main(final String[] argv) {
new ReorderSam().instanceMainWithExit(argv);
}
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(REFERENCE);
IOUtil.assertFileIsWritable(OUTPUT);
final SAMFileReader in = new SAMFileReader(INPUT);
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE);
SAMSequenceDictionary refDict = reference.getSequenceDictionary();
if (refDict == null) {
log.error("No reference sequence dictionary found. Aborting. You can create a sequence dictionary for the reference fasta using CreateSequenceDictionary.jar.");
in.close();
return 1;
}
printDictionary("SAM/BAM file", in.getFileHeader().getSequenceDictionary());
printDictionary("Reference", refDict);
Map<Integer, Integer> newOrder = buildSequenceDictionaryMap(refDict, in.getFileHeader().getSequenceDictionary());
// has to be after we create the newOrder
SAMFileHeader outHeader = in.getFileHeader().clone();
outHeader.setSequenceDictionary(refDict);
log.info("Writing reads...");
if (in.hasIndex()) {
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, true, OUTPUT);
// write the reads in contig order
for (final SAMSequenceRecord contig : refDict.getSequences() ) {
final SAMRecordIterator it = in.query(contig.getSequenceName(), 0, 0, false);
writeReads(out, it, newOrder, contig.getSequenceName());
}
// don't forget the unmapped reads
writeReads( out, in.queryUnmapped(), newOrder, "unmapped" );
out.close();
}
else {
SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, false, OUTPUT);
writeReads(out, in.iterator(), newOrder, "All reads");
out.close();
}
// cleanup
in.close();
return 0;
}
/**
* Low-level helper function that returns the new reference index for oldIndex according to the
* ordering map newOrder. Read is provided in case an error occurs, so that an informative message
* can be made.
*/
private int newOrderIndex(SAMRecord read, int oldIndex, Map<Integer, Integer> newOrder) {
if ( oldIndex == -1 )
return -1; // unmapped read
else {
final Integer n = newOrder.get(oldIndex);
if (n == null) throw new PicardException("BUG: no mapping found for read " + read.format());
else return n;
}
}
/**
* Helper function that writes reads from iterator it into writer out, updating each SAMRecord along the way
* according to the newOrder mapping from dictionary index -> index. Name is used for printing only.
*/
private void writeReads(final SAMFileWriter out,
final SAMRecordIterator it,
final Map<Integer, Integer> newOrder,
final String name) {
long counter = 0;
log.info(" Processing " + name);
while ( it.hasNext() ) {
counter++;
final SAMRecord read = it.next();
final int oldRefIndex = read.getReferenceIndex();
final int oldMateIndex = read.getMateReferenceIndex();
final int newRefIndex = newOrderIndex(read, oldRefIndex, newOrder);
read.setHeader(out.getFileHeader());
read.setReferenceIndex(newRefIndex);
final int newMateIndex = newOrderIndex(read, oldMateIndex, newOrder);
if ( oldMateIndex != -1 && newMateIndex == -1 ) { // becoming unmapped
read.setMateAlignmentStart(0);
read.setMateUnmappedFlag(true);
read.setAttribute(SAMTag.MC.name(), null); // Set the Mate Cigar String to null
}
read.setMateReferenceIndex(newMateIndex);
out.addAlignment(read);
}
it.close();
log.info("Wrote " + counter + " reads");
}
/**
* Constructs a mapping from read sequence records index -> new sequence dictionary index for use in
* reordering the reference index and mate reference index in each read. -1 means unmapped.
*/
private Map<Integer, Integer> buildSequenceDictionaryMap(final SAMSequenceDictionary refDict,
final SAMSequenceDictionary readsDict) {
Map<Integer, Integer> newOrder = new HashMap<Integer, Integer>();
log.info("Reordering SAM/BAM file:");
for (final SAMSequenceRecord refRec : refDict.getSequences() ) {
final SAMSequenceRecord readsRec = readsDict.getSequence(refRec.getSequenceName());
if (readsRec != null) {
if ( refRec.getSequenceLength() != readsRec.getSequenceLength() ) {
String msg = String.format("Discordant contig lengths: read %s LN=%d, ref %s LN=%d",
readsRec.getSequenceName(), readsRec.getSequenceLength(),
refRec.getSequenceName(), refRec.getSequenceLength());
if ( ALLOW_CONTIG_LENGTH_DISCORDANCE ) {
log.warn(msg);
}
else {
throw new PicardException(msg);
}
}
log.info(String.format(" Reordering read contig %s [index=%d] to => ref contig %s [index=%d]%n",
readsRec.getSequenceName(), readsRec.getSequenceIndex(),
refRec.getSequenceName(), refRec.getSequenceIndex() ));
newOrder.put(readsRec.getSequenceIndex(), refRec.getSequenceIndex());
}
}
for ( SAMSequenceRecord readsRec : readsDict.getSequences() ) {
if ( ! newOrder.containsKey(readsRec.getSequenceIndex()) ) {
if ( ALLOW_INCOMPLETE_DICT_CONCORDANCE )
newOrder.put(readsRec.getSequenceIndex(), -1);
else
throw new PicardException("New reference sequence does not contain a matching contig for " + readsRec.getSequenceName());
}
}
return newOrder;
}
/**
* Helper function to print out a sequence dictionary
*/
private void printDictionary(String name, SAMSequenceDictionary dict) {
log.info(name);
for (final SAMSequenceRecord contig : dict.getSequences()) {
log.info( " SN=%s LN=%d%n", contig.getSequenceName(), contig.getSequenceLength());
}
}
}