/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.illumina;
import htsjdk.samtools.ReservedTagConstants;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Illumina;
import picard.cmdline.StandardOptionDefinitions;
import picard.util.AdapterMarker;
import picard.util.AdapterPair;
import picard.util.ClippingUtility;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import static picard.util.IlluminaUtil.IlluminaAdapterPair;
/**
* Command line program to mark the location of adapter sequences.
* This also outputs a Histogram of metrics describing the clipped bases
*
* @author Tim Fennell (adapted by mborkan@broadinstitute.org)
*/
@CommandLineProgramProperties(
usage = "Reads a SAM or BAM file and rewrites it with new adapter-trimming tags.\n" +
"Clear any existing adapter-trimming tags (XT:i:).\n" +
"Only works for unaligned files in query-name order.\n"+
"Note: This is a utility program and will not be run in the pipeline.\n",
usageShort = "Reads a SAM or BAM file and rewrites it with new adapter-trimming tags",
programGroup = Illumina.class
)
public class MarkIlluminaAdapters extends CommandLineProgram {
// The following attributes define the command-line arguments
@Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME)
public File INPUT;
@Option(doc="If output is not specified, just the metrics are generated",
shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, optional=true)
public File OUTPUT;
@Option(doc="Histogram showing counts of bases_clipped in how many reads", shortName="M")
public File METRICS;
@Option(doc="The minimum number of bases to match over when clipping single-end reads.")
public int MIN_MATCH_BASES_SE = ClippingUtility.MIN_MATCH_BASES;
@Option(doc="The minimum number of bases to match over (per-read) when clipping paired-end reads.")
public int MIN_MATCH_BASES_PE = ClippingUtility.MIN_MATCH_PE_BASES;
@Option(doc="The maximum mismatch error rate to tolerate when clipping single-end reads.")
public double MAX_ERROR_RATE_SE = ClippingUtility.MAX_ERROR_RATE;
@Option(doc="The maximum mismatch error rate to tolerate when clipping paired-end reads.")
public double MAX_ERROR_RATE_PE = ClippingUtility.MAX_PE_ERROR_RATE;
@Option(doc="DEPRECATED. Whether this is a paired-end run. No longer used.", shortName="PE", optional=true)
public Boolean PAIRED_RUN;
@Option(doc="Which adapters sequences to attempt to identify and clip.")
public List<IlluminaAdapterPair> ADAPTERS =
CollectionUtil.makeList(IlluminaAdapterPair.INDEXED,
IlluminaAdapterPair.DUAL_INDEXED,
IlluminaAdapterPair.PAIRED_END
);
@Option(doc="For specifying adapters other than standard Illumina", optional=true)
public String FIVE_PRIME_ADAPTER;
@Option(doc="For specifying adapters other than standard Illumina", optional=true)
public String THREE_PRIME_ADAPTER;
@Option(doc="Adapters are truncated to this length to speed adapter matching. Set to a large number to effectively disable truncation.")
public int ADAPTER_TRUNCATION_LENGTH = AdapterMarker.DEFAULT_ADAPTER_LENGTH;
@Option(doc="If looking for multiple adapter sequences, then after having seen this many adapters, shorten the list of sequences. " +
"Keep the adapters that were found most frequently in the input so far. " +
"Set to -1 if the input has a heterogeneous mix of adapters so shortening is undesirable.",
shortName = "APT")
public int PRUNE_ADAPTER_LIST_AFTER_THIS_MANY_ADAPTERS_SEEN = AdapterMarker.DEFAULT_PRUNE_ADAPTER_LIST_AFTER_THIS_MANY_ADAPTERS_SEEN;
@Option(doc="If pruning the adapter list, keep only this many adapter sequences when pruning the list (plus any adapters that " +
"were tied with the adapters being kept).")
public int NUM_ADAPTERS_TO_KEEP = AdapterMarker.DEFAULT_NUM_ADAPTERS_TO_KEEP;
private static final Log log = Log.getInstance(MarkIlluminaAdapters.class);
// Stock main method
public static void main(final String[] args) {
System.exit(new MarkIlluminaAdapters().instanceMain(args));
}
@Override
protected String[] customCommandLineValidation() {
if ((FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER == null) || (THREE_PRIME_ADAPTER != null && FIVE_PRIME_ADAPTER == null)) {
return new String[] {"Either both or neither of THREE_PRIME_ADAPTER and FIVE_PRIME_ADAPTER must be set."};
}
else {
return null;
}
}
@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(METRICS);
final SAMFileReader in = new SAMFileReader(INPUT);
final SAMFileHeader.SortOrder order = in.getFileHeader().getSortOrder();
SAMFileWriter out = null;
if (OUTPUT != null) {
IOUtil.assertFileIsWritable(OUTPUT);
out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);
}
final Histogram<Integer> histo = new Histogram<Integer>("clipped_bases", "read_count");
// Combine any adapters and custom adapter pairs from the command line into an array for use in clipping
final AdapterPair[] adapters;
{
final List<AdapterPair> tmp = new ArrayList<AdapterPair>();
tmp.addAll(ADAPTERS);
if (FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER != null) {
tmp.add(new CustomAdapterPair(FIVE_PRIME_ADAPTER, THREE_PRIME_ADAPTER));
}
adapters = tmp.toArray(new AdapterPair[tmp.size()]);
}
////////////////////////////////////////////////////////////////////////
// Main loop that consumes reads, clips them and writes them to the output
////////////////////////////////////////////////////////////////////////
final ProgressLogger progress = new ProgressLogger(log, 1000000, "Read");
final SAMRecordIterator iterator = in.iterator();
final AdapterMarker adapterMarker = new AdapterMarker(ADAPTER_TRUNCATION_LENGTH, adapters).
setMaxPairErrorRate(MAX_ERROR_RATE_PE).setMinPairMatchBases(MIN_MATCH_BASES_PE).
setMaxSingleEndErrorRate(MAX_ERROR_RATE_SE).setMinSingleEndMatchBases(MIN_MATCH_BASES_SE).
setNumAdaptersToKeep(NUM_ADAPTERS_TO_KEEP).
setThresholdForSelectingAdaptersToKeep(PRUNE_ADAPTER_LIST_AFTER_THIS_MANY_ADAPTERS_SEEN);
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
final SAMRecord rec2 = rec.getReadPairedFlag() && iterator.hasNext() ? iterator.next() : null;
rec.setAttribute(ReservedTagConstants.XT, null);
// Do the clipping one way for PE and another for SE reads
if (rec.getReadPairedFlag()) {
// Assert that the input file is in query name order only if we see some PE reads
if (order != SAMFileHeader.SortOrder.queryname) {
throw new PicardException("Input BAM file must be sorted by queryname");
}
if (rec2 == null) throw new PicardException("Missing mate pair for paired read: " + rec.getReadName());
rec2.setAttribute(ReservedTagConstants.XT, null);
// Assert that we did in fact just get two mate pairs
if (!rec.getReadName().equals(rec2.getReadName())){
throw new PicardException("Adjacent reads expected to be mate-pairs have different names: " +
rec.getReadName() + ", " + rec2.getReadName());
}
// establish which of pair is first and which second
final SAMRecord first, second;
if (rec.getFirstOfPairFlag() && rec2.getSecondOfPairFlag()){
first = rec;
second = rec2;
}
else if (rec.getSecondOfPairFlag() && rec2.getFirstOfPairFlag()) {
first = rec2;
second = rec;
}
else {
throw new PicardException("Two reads with same name but not correctly marked as 1st/2nd of pair: " + rec.getReadName());
}
adapterMarker.adapterTrimIlluminaPairedReads(first, second);
}
else {
adapterMarker.adapterTrimIlluminaSingleRead(rec);
}
// Then output the records, update progress and metrics
for (final SAMRecord r : new SAMRecord[] {rec, rec2}) {
if (r != null) {
progress.record(r);
if (out != null) out.addAlignment(r);
final Integer clip = rec.getIntegerAttribute(ReservedTagConstants.XT);
if (clip != null) histo.increment(rec.getReadLength() - clip + 1);
}
}
}
if (out != null) out.close();
// Lastly output the metrics to file
final MetricsFile<?,Integer> metricsFile = getMetricsFile();
metricsFile.setHistogram(histo);
metricsFile.write(METRICS);
return 0;
}
private class CustomAdapterPair implements AdapterPair {
final String fivePrime, threePrime, fivePrimeReadOrder;
final byte[] fivePrimeBytes, threePrimeBytes, fivePrimeReadOrderBytes;
private CustomAdapterPair(final String fivePrime, final String threePrime) {
this.threePrime = threePrime;
this.threePrimeBytes = StringUtil.stringToBytes(threePrime);
this.fivePrime = fivePrime;
this.fivePrimeReadOrder = SequenceUtil.reverseComplement(fivePrime);
this.fivePrimeBytes = StringUtil.stringToBytes(fivePrime);
this.fivePrimeReadOrderBytes = StringUtil.stringToBytes(fivePrimeReadOrder);
}
public String get3PrimeAdapter(){ return threePrime; }
public String get5PrimeAdapter(){ return fivePrime; }
public String get3PrimeAdapterInReadOrder(){ return threePrime; }
public String get5PrimeAdapterInReadOrder() { return fivePrimeReadOrder; }
public byte[] get3PrimeAdapterBytes() { return threePrimeBytes; }
public byte[] get5PrimeAdapterBytes() { return fivePrimeBytes; }
public byte[] get3PrimeAdapterBytesInReadOrder() { return threePrimeBytes; }
public byte[] get5PrimeAdapterBytesInReadOrder() { return fivePrimeReadOrderBytes; }
public String getName() { return "Custom adapter pair"; }
}
}