/*
* The MIT License
*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.illumina;
import htsjdk.samtools.BAMRecordCodec;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordQueryNameComparator;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Iso8601Date;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Illumina;
import picard.cmdline.StandardOptionDefinitions;
import picard.illumina.parser.ReadStructure;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import picard.util.IlluminaUtil;
import picard.util.IlluminaUtil.IlluminaAdapterPair;
import picard.util.TabbedTextFileWithHeaderParser;
import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* IlluminaBasecallsToSam transforms a lane of Illumina data file formats (bcl, locs, clocs, qseqs, etc.) into
* SAM or BAM file format.
* <p/>
* In this application, barcode data is read from Illumina data file groups, each of which is associated with a tile.
* Each tile may contain data for any number of barcodes, and a single barcode's data may span multiple tiles. Once the
* barcode data is collected from files, each barcode's data is written to its own SAM/BAM. The barcode data must be
* written in order; this means that barcode data from each tile is sorted before it is written to file, and that if a
* barcode's data does span multiple tiles, data collected from each tile must be written in the order of the tiles
* themselves.
* <p/>
* This class employs a number of private subclasses to achieve this goal. The TileReadAggregator controls the flow
* of operation. It is fed a number of Tiles which it uses to spawn TileReaders. TileReaders are responsible for
* reading Illumina data for their respective tiles from disk, and as they collect that data, it is fed back into the
* TileReadAggregator. When a TileReader completes a tile, it notifies the TileReadAggregator, which reviews what was
* read and conditionally queues its writing to disk, baring in mind the requirements of write-order described in the
* previous paragraph. As writes complete, the TileReadAggregator re-evalutes the state of reads/writes and may queue
* more writes. When all barcodes for all tiles have been written, the TileReadAggregator shuts down.
* <p/>
* The TileReadAggregator controls task execution using a specialized ThreadPoolExecutor. It accepts special Runnables
* of type PriorityRunnable which allow a priority to be assigned to the runnable. When the ThreadPoolExecutor is
* assigning threads, it gives priority to those PriorityRunnables with higher priority values. In this application,
* TileReaders are assigned lowest priority, and write tasks are assigned high priority. It is designed in this fashion
* to minimize the amount of time data must remain in memory (write the data as soon as possible, then discard it from
* memory) while maximizing CPU usage.
*
* @author jburke@broadinstitute.org
* @author mccowan@broadinstitute.org
*/
@CommandLineProgramProperties(
usage = IlluminaBasecallsToSam.USAGE,
usageShort = IlluminaBasecallsToSam.USAGE,
programGroup = Illumina.class
)
public class IlluminaBasecallsToSam extends CommandLineProgram {
// The following attributes define the command-line arguments
public static final String USAGE = "Generate a SAM or BAM file from data in an Illumina basecalls output directory";
@Option(doc = "The basecalls directory. ", shortName = "B")
public File BASECALLS_DIR;
@Option(doc = "The barcodes directory with _barcode.txt files (generated by ExtractIlluminaBarcodes). If not set, use BASECALLS_DIR. ", shortName = "BCD", optional = true)
public File BARCODES_DIR;
@Option(doc = "Lane number. ", shortName = StandardOptionDefinitions.LANE_SHORT_NAME)
public Integer LANE;
@Option(doc = "Deprecated (use LIBRARY_PARAMS). The output SAM or BAM file. Format is determined by extension.",
shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
mutex = {"BARCODE_PARAMS", "LIBRARY_PARAMS"})
public File OUTPUT;
@Option(doc = "The barcode of the run. Prefixed to read names.")
public String RUN_BARCODE;
@Option(doc = "Deprecated (use LIBRARY_PARAMS). The name of the sequenced sample",
shortName = StandardOptionDefinitions.SAMPLE_ALIAS_SHORT_NAME,
mutex = {"BARCODE_PARAMS", "LIBRARY_PARAMS"})
public String SAMPLE_ALIAS;
@Option(doc = "ID used to link RG header record with RG tag in SAM record. " +
"If these are unique in SAM files that get merged, merge performance is better. " +
"If not specified, READ_GROUP_ID will be set to <first 5 chars of RUN_BARCODE>.<LANE> .",
shortName = StandardOptionDefinitions.READ_GROUP_ID_SHORT_NAME, optional = true)
public String READ_GROUP_ID;
@Option(doc = "Deprecated (use LIBRARY_PARAMS). The name of the sequenced library",
shortName = StandardOptionDefinitions.LIBRARY_NAME_SHORT_NAME,
optional = true,
mutex = {"BARCODE_PARAMS", "LIBRARY_PARAMS"})
public String LIBRARY_NAME;
@Option(doc = "The name of the sequencing center that produced the reads. Used to set the RG.CN tag.", optional = true)
public String SEQUENCING_CENTER = "BI";
@Option(doc = "The start date of the run.", optional = true)
public Date RUN_START_DATE;
@Option(doc = "The name of the sequencing technology that produced the read.", optional = true)
public String PLATFORM = "illumina";
@Option(doc = ReadStructure.PARAMETER_DOC, shortName = "RS")
public String READ_STRUCTURE;
@Option(doc = "Deprecated (use LIBRARY_PARAMS). Tab-separated file for creating all output BAMs for barcoded run " +
"with single IlluminaBasecallsToSam invocation. Columns are BARCODE, OUTPUT, SAMPLE_ALIAS, and " +
"LIBRARY_NAME. Row with BARCODE=N is used to specify a file for no barcode match",
mutex = {"OUTPUT", "SAMPLE_ALIAS", "LIBRARY_NAME", "LIBRARY_PARAMS"})
public File BARCODE_PARAMS;
@Option(doc = "Tab-separated file for creating all output BAMs for a lane with single IlluminaBasecallsToSam " +
"invocation. The columns are OUTPUT, SAMPLE_ALIAS, and LIBRARY_NAME, BARCODE_1, BARCODE_2 ... BARCODE_X " +
"where X = number of barcodes per cluster (optional). Row with BARCODE_1 set to 'N' is used to specify a file " +
"for no barcode match. You may also provide any 2 letter RG header attributes (excluding PU, CN, PL, and" +
" DT) as columns in this file and the values for those columns will be inserted into the RG tag for the" +
" BAM file created for a given row.",
mutex = {"OUTPUT", "SAMPLE_ALIAS", "LIBRARY_NAME", "BARCODE_PARAMS"})
public File LIBRARY_PARAMS;
@Option(doc = "Which adapters to look for in the read.")
public List<IlluminaAdapterPair> ADAPTERS_TO_CHECK = new ArrayList<IlluminaAdapterPair>(
Arrays.asList(IlluminaAdapterPair.INDEXED,
IlluminaAdapterPair.DUAL_INDEXED,
IlluminaAdapterPair.NEXTERA_V2,
IlluminaAdapterPair.FLUIDIGM));
@Option(doc = "The number of threads to run in parallel. If NUM_PROCESSORS = 0, number of cores is automatically set to " +
"the number of cores available on the machine. If NUM_PROCESSORS < 0, then the number of cores used will" +
" be the number available on the machine less NUM_PROCESSORS.")
public Integer NUM_PROCESSORS = 0;
@Option(doc = "If set, this is the first tile to be processed (used for debugging). Note that tiles are not processed" +
" in numerical order.",
optional = true)
public Integer FIRST_TILE;
@Option(doc = "If set, process no more than this many tiles (used for debugging).", optional = true)
public Integer TILE_LIMIT;
@Option(doc = "If true, call System.gc() periodically. This is useful in cases in which the -Xmx value passed " +
"is larger than the available memory.")
public Boolean FORCE_GC = true;
@Option(doc="Apply EAMSS filtering to identify inappropriately quality scored bases towards the ends of reads" +
" and convert their quality scores to Q2.")
public boolean APPLY_EAMSS_FILTER = true;
@Option(doc = "Configure SortingCollections to store this many records before spilling to disk. For an indexed" +
" run, each SortingCollection gets this value/number of indices.")
public int MAX_READS_IN_RAM_PER_TILE = 1200000;
@Option(doc="The minimum quality (after transforming 0s to 1s) expected from reads. If qualities are lower than this value, an error is thrown." +
"The default of 2 is what the Illumina's spec describes as the minimum, but in practice the value has been observed lower.")
public int MINIMUM_QUALITY = BclQualityEvaluationStrategy.ILLUMINA_ALLEGED_MINIMUM_QUALITY;
@Option(doc="Whether to include non-PF reads", shortName="NONPF", optional=true)
public boolean INCLUDE_NON_PF_READS = true;
private final Map<String, SAMFileWriterWrapper> barcodeSamWriterMap = new HashMap<String, SAMFileWriterWrapper>();
private ReadStructure readStructure;
IlluminaBasecallsConverter<SAMRecordsForCluster> basecallsConverter;
private static final Log log = Log.getInstance(IlluminaBasecallsToSam.class);
private BclQualityEvaluationStrategy bclQualityEvaluationStrategy;
@Override
protected int doWork() {
initialize();
basecallsConverter.doTileProcessing();
return 0;
}
/**
* Prepares loggers, initiates garbage collection thread, parses arguments and initialized variables appropriately/
*/
private void initialize() {
this.bclQualityEvaluationStrategy = new BclQualityEvaluationStrategy(MINIMUM_QUALITY);
if (OUTPUT != null) {
IOUtil.assertFileIsWritable(OUTPUT);
}
if (LIBRARY_PARAMS != null) {
IOUtil.assertFileIsReadable(LIBRARY_PARAMS);
}
if (OUTPUT != null) {
barcodeSamWriterMap.put(null, buildSamFileWriter(OUTPUT, SAMPLE_ALIAS, LIBRARY_NAME, buildSamHeaderParameters(null)));
} else {
populateWritersFromLibraryParams();
}
readStructure = new ReadStructure(READ_STRUCTURE);
final int numOutputRecords = readStructure.templates.length();
basecallsConverter = new IlluminaBasecallsConverter<SAMRecordsForCluster>(BASECALLS_DIR, BARCODES_DIR, LANE, readStructure,
barcodeSamWriterMap, true, MAX_READS_IN_RAM_PER_TILE/numOutputRecords, TMP_DIR, NUM_PROCESSORS, FORCE_GC,
FIRST_TILE, TILE_LIMIT, new QueryNameComparator(), new Codec(numOutputRecords), SAMRecordsForCluster.class,
bclQualityEvaluationStrategy, this.APPLY_EAMSS_FILTER, INCLUDE_NON_PF_READS);
log.info("DONE_READING STRUCTURE IS " + readStructure.toString());
/**
* Be sure to pass the outputReadStructure to ClusterDataToSamConverter, which reflects the structure of the output cluster
* data which may be different from the input read structure (specifically if there are skips).
*/
final ClusterDataToSamConverter converter = new ClusterDataToSamConverter(RUN_BARCODE, READ_GROUP_ID,
basecallsConverter.getFactory().getOutputReadStructure(), ADAPTERS_TO_CHECK);
basecallsConverter.setConverter(converter);
}
/**
* Assert that expectedCols are present and return actualCols - expectedCols
*
* @param actualCols The columns present in the LIBRARY_PARAMS file
* @param expectedCols The columns that are REQUIRED
* @return actualCols - expectedCols
*/
private Set<String> findAndFilterExpectedColumns(final Set<String> actualCols, final Set<String> expectedCols) {
final Set<String> missingColumns = new HashSet<String>(expectedCols);
missingColumns.removeAll(actualCols);
if (missingColumns.size() > 0) {
throw new PicardException(String.format(
"LIBRARY_PARAMS file %s is missing the following columns: %s.",
LIBRARY_PARAMS.getAbsolutePath(), StringUtil.join(", ", missingColumns
)));
}
final Set<String> remainingColumns = new HashSet<String>(actualCols);
remainingColumns.removeAll(expectedCols);
return remainingColumns;
}
/**
* Given a set of columns assert that all columns conform to the format of an RG header attribute (i.e. 2 letters)
* the attribute is NOT a member of the rgHeaderTags that are built by default in buildSamHeaderParameters
*
* @param rgTagColumns A set of columns that should conform to the rg header attribute format
*/
private void checkRgTagColumns(final Set<String> rgTagColumns) {
final Set<String> forbiddenHeaders = buildSamHeaderParameters(null).keySet();
forbiddenHeaders.retainAll(rgTagColumns);
if (forbiddenHeaders.size() > 0) {
throw new PicardException("Illegal ReadGroup tags in library params(barcode params) file(" + LIBRARY_PARAMS.getAbsolutePath() + ") Offending headers = " + StringUtil.join(", ", forbiddenHeaders));
}
for (final String column : rgTagColumns) {
if (column.length() > 2) {
throw new PicardException("Column label (" + column + ") unrecognized. Library params(barcode params) can only contain the columns " +
"(OUTPUT, LIBRARY_NAME, SAMPLE_ALIAS, BARCODE, BARCODE_<X> where X is a positive integer) OR two letter RG tags!");
}
}
}
/**
* For each line in the LIBRARY_PARAMS file create a SamFileWriter and put it in the barcodeSamWriterMap map, where
* the key to the map is the concatenation of all barcodes in order for the given line
*/
private void populateWritersFromLibraryParams() {
final TabbedTextFileWithHeaderParser libraryParamsParser = new TabbedTextFileWithHeaderParser(LIBRARY_PARAMS);
final Set<String> expectedColumnLabels = CollectionUtil.makeSet("OUTPUT", "SAMPLE_ALIAS", "LIBRARY_NAME");
final List<String> barcodeColumnLabels = new ArrayList<String>();
if (readStructure.barcodes.length() == 1) {
//For the single barcode read case, the barcode label name can either by BARCODE or BARCODE_1
if (libraryParamsParser.hasColumn("BARCODE")) {
barcodeColumnLabels.add("BARCODE");
} else if (libraryParamsParser.hasColumn("BARCODE_1")) {
barcodeColumnLabels.add("BARCODE_1");
} else {
throw new PicardException("LIBRARY_PARAMS(BARCODE_PARAMS) file " + LIBRARY_PARAMS + " does not have column BARCODE or BARCODE_1.");
}
} else {
for (int i = 1; i <= readStructure.barcodes.length(); i++) {
barcodeColumnLabels.add("BARCODE_" + i);
}
}
expectedColumnLabels.addAll(barcodeColumnLabels);
final Set<String> rgTagColumns = findAndFilterExpectedColumns(libraryParamsParser.columnLabels(), expectedColumnLabels);
checkRgTagColumns(rgTagColumns);
for (final TabbedTextFileWithHeaderParser.Row row : libraryParamsParser) {
List<String> barcodeValues = null;
if (barcodeColumnLabels.size() > 0) {
barcodeValues = new ArrayList<String>();
for (final String barcodeLabel : barcodeColumnLabels) {
barcodeValues.add(row.getField(barcodeLabel));
}
}
final String key = (barcodeValues == null || barcodeValues.contains("N")) ? null : StringUtil.join("", barcodeValues);
if (barcodeSamWriterMap.containsKey(key)) { //This will catch the case of having more than 1 line in a non-barcoded LIBRARY_PARAMS file
throw new PicardException("Row for barcode " + key + " appears more than once in LIBRARY_PARAMS or BARCODE_PARAMS file " +
LIBRARY_PARAMS);
}
final Map<String, String> samHeaderParams = buildSamHeaderParameters(barcodeValues);
for (final String tagName : rgTagColumns) {
samHeaderParams.put(tagName, row.getField(tagName));
}
final SAMFileWriterWrapper writer = buildSamFileWriter(new File(row.getField("OUTPUT")),
row.getField("SAMPLE_ALIAS"), row.getField("LIBRARY_NAME"), samHeaderParams);
barcodeSamWriterMap.put(key, writer);
}
if (barcodeSamWriterMap.isEmpty()) {
throw new PicardException("LIBRARY_PARAMS(BARCODE_PARAMS) file " + LIBRARY_PARAMS + " does have any data rows.");
}
libraryParamsParser.close();
}
/**
* Create the list of headers that will be added to the SAMFileHeader for a library with the given barcodes (or
* the entire run if barcodes == NULL). Note that any value that is null will NOT be added via buildSamFileWriter
* but is placed in the map in order to be able to query the tags that we automatically add.
*
* @param barcodes The list of barcodes that uniquely identify the read group we are building parameters for
* @return A Map of ReadGroupHeaderTags -> Values
*/
private Map<String, String> buildSamHeaderParameters(final List<String> barcodes) {
final Map<String, String> params = new HashMap<String, String>();
String platformUnit = RUN_BARCODE + "." + LANE;
if (barcodes != null) platformUnit += ("." + IlluminaUtil.barcodeSeqsToString(barcodes));
params.put("PU", platformUnit);
params.put("CN", SEQUENCING_CENTER);
params.put("PL", PLATFORM);
if (RUN_START_DATE != null) {
final Iso8601Date date = new Iso8601Date(RUN_START_DATE);
params.put("DT", date.toString());
} else {
params.put("DT", null);
}
return params;
}
/**
* Build a SamFileWriter that will write its contents to the output file.
*
* @param output The file to which to write
* @param sampleAlias The sample alias set in the read group header
* @param libraryName The name of the library to which this read group belongs
* @param headerParameters Header parameters that will be added to the RG header for this SamFile
* @return A SAMFileWriter
*/
private SAMFileWriterWrapper buildSamFileWriter(final File output, final String sampleAlias,
final String libraryName, final Map<String, String> headerParameters) {
IOUtil.assertFileIsWritable(output);
final SAMReadGroupRecord rg = new SAMReadGroupRecord(READ_GROUP_ID);
rg.setSample(sampleAlias);
if (libraryName != null) rg.setLibrary(libraryName);
for (final Map.Entry<String, String> tagNameToValue : headerParameters.entrySet()) {
if (tagNameToValue.getValue() != null) {
rg.setAttribute(tagNameToValue.getKey(), tagNameToValue.getValue());
}
}
final SAMFileHeader header = new SAMFileHeader();
header.setSortOrder(SAMFileHeader.SortOrder.queryname);
header.addReadGroup(rg);
return new SAMFileWriterWrapper(new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, output));
}
public static void main(final String[] args) {
System.exit(new IlluminaBasecallsToSam().instanceMain(args));
}
/**
* Put any custom command-line validation in an override of this method.
* clp is initialized at this point and can be used to print usage and access args.
* Any options set by command-line parser can be validated.
*
* @return null if command line is valid. If command line is invalid, returns an array of error message
* to be written to the appropriate place.
*/
@Override
protected String[] customCommandLineValidation() {
if (BARCODE_PARAMS != null) {
LIBRARY_PARAMS = BARCODE_PARAMS;
}
final ArrayList<String> messages = new ArrayList<String>();
readStructure = new ReadStructure(READ_STRUCTURE);
if (!readStructure.barcodes.isEmpty()) {
if (LIBRARY_PARAMS == null) {
messages.add("BARCODE_PARAMS or LIBRARY_PARAMS is missing. If READ_STRUCTURE contains a B (barcode)" +
" then either LIBRARY_PARAMS or BARCODE_PARAMS(deprecated) must be provided!");
}
}
if (READ_GROUP_ID == null) {
READ_GROUP_ID = RUN_BARCODE.substring(0, 5) + "." + LANE;
}
if (messages.size() == 0) {
return null;
}
return messages.toArray(new String[messages.size()]);
}
private static class SAMFileWriterWrapper
implements IlluminaBasecallsConverter.ConvertedClusterDataWriter<SAMRecordsForCluster> {
public final SAMFileWriter writer;
private SAMFileWriterWrapper(final SAMFileWriter writer) {
this.writer = writer;
}
@Override
public void write(final SAMRecordsForCluster records) {
for (final SAMRecord rec : records.records) {
writer.addAlignment(rec);
}
}
@Override
public void close() {
writer.close();
}
}
static class SAMRecordsForCluster {
final SAMRecord[] records;
SAMRecordsForCluster(final int numRecords) {
records = new SAMRecord[numRecords];
}
}
static class QueryNameComparator implements Comparator<SAMRecordsForCluster> {
private final SAMRecordQueryNameComparator comparator = new SAMRecordQueryNameComparator();
@Override
public int compare(final SAMRecordsForCluster s1, final SAMRecordsForCluster s2) {
return comparator.compare(s1.records[0], s2.records[0]);
}
}
static class Codec implements SortingCollection.Codec<SAMRecordsForCluster> {
private final BAMRecordCodec bamCodec;
private final int numRecords;
Codec(final int numRecords, final BAMRecordCodec bamCodec) {
this.numRecords = numRecords;
this.bamCodec = bamCodec;
}
Codec(final int numRecords) {
this(numRecords, new BAMRecordCodec(null));
}
@Override
public void setOutputStream(final OutputStream os) {
bamCodec.setOutputStream(os);
}
@Override
public void setInputStream(final InputStream is) {
bamCodec.setInputStream(is);
}
@Override
public void encode(final SAMRecordsForCluster val) {
if (val.records.length != numRecords) {
throw new IllegalStateException(String.format("Expected number of clusters %d != actual %d",
numRecords, val.records.length));
}
for (final SAMRecord rec : val.records) {
bamCodec.encode(rec);
}
}
@Override
public SAMRecordsForCluster decode() {
final SAMRecord zerothRecord = bamCodec.decode();
if (zerothRecord == null) return null;
final SAMRecordsForCluster ret = new SAMRecordsForCluster(numRecords);
ret.records[0] = zerothRecord;
for (int i = 1; i < numRecords; ++i) {
ret.records[i] = bamCodec.decode();
if (ret.records[i] == null) {
throw new IllegalStateException(String.format("Expected to read % records but read only %d", numRecords, i));
}
}
return ret;
}
@Override
public SortingCollection.Codec<SAMRecordsForCluster> clone() {
return new Codec(numRecords, bamCodec.clone());
}
}
}