/*
* The MIT License
*
* Copyright (c) 2013 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.illumina;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SortingCollection;
import picard.PicardException;
import picard.illumina.parser.ClusterData;
import picard.illumina.parser.IlluminaDataProvider;
import picard.illumina.parser.IlluminaDataProviderFactory;
import picard.illumina.parser.IlluminaDataType;
import picard.illumina.parser.ReadStructure;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import picard.util.FileChannelJDKBugWorkAround;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.atomic.AtomicBoolean;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
/**
* Manages the conversion of Illumina basecalls into some output format. Creates multiple threads to manage reading,
* sorting and writing efficiently. Output is written in queryname order. Optionally demultiplexes indexed reads
* into separate outputs by barcode.
*
* @param <CLUSTER_OUTPUT_RECORD> The class to which a ClusterData is converted in preparation for writing.
*/
public class IlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> {
/**
 * Describes the state of a barcode's data's processing in the context of a tile. It is either not available in
 * that tile, has been read, has been queued to be written to file, or has been written to file. A barcode only
 * takes on a state once the tile (which is serving as the context of this state) has been read.
 */
private enum TileBarcodeProcessingState {
    NA,               // no data for this barcode was observed in the tile
    READ,             // the barcode's data has been read into its sorting collection
    QUEUED_FOR_WRITE, // a write task for this barcode/tile pair has been enqueued
    WRITTEN           // the barcode's data for this tile has been written out
}
/**
 * Describes the state of a tile being processed. It is either not yet completely read, or read.
 */
private enum TileProcessingState {
    NOT_DONE_READING, // clusters are still being read from the tile
    DONE_READING      // every cluster for the tile has been read
}
private static final Log log = Log.getInstance(IlluminaBasecallsConverter.class);

// Data types that must be parsed from the run directory when no demultiplexing is performed.
public static final IlluminaDataType[] DATA_TYPES_NO_BARCODE =
        {IlluminaDataType.BaseCalls, IlluminaDataType.QualityScores, IlluminaDataType.Position, IlluminaDataType.PF};

// Same as DATA_TYPES_NO_BARCODE plus one trailing slot for barcode data, filled in below.
private static final IlluminaDataType[] DATA_TYPES_WITH_BARCODE = Arrays.copyOf(DATA_TYPES_NO_BARCODE, DATA_TYPES_NO_BARCODE.length + 1);

static {
    DATA_TYPES_WITH_BARCODE[DATA_TYPES_WITH_BARCODE.length - 1] = IlluminaDataType.Barcodes;
}
/**
 * A comparator for tile numbers, which are not necessarily ordered by the number's value.
 */
public static final Comparator<Integer> TILE_NUMBER_COMPARATOR = new Comparator<Integer>() {
    @Override
    public int compare(final Integer lhs, final Integer rhs) {
        final String left = lhs.toString();
        final String right = rhs.toString();
        // In read names the tile number is followed by a colon, so a tile number that is a
        // proper prefix of another must sort after it (e.g. 10 sorts after 100).
        if (left.length() < right.length() && right.startsWith(left)) {
            return 1;
        }
        if (right.length() < left.length() && left.startsWith(right)) {
            return -1;
        }
        return left.compareTo(right);
    }
};
// Orders output records within a single tile before they are written.
private final Comparator<CLUSTER_OUTPUT_RECORD> outputRecordComparator;
private final BclQualityEvaluationStrategy bclQualityEvaluationStrategy;
// Maps each barcode to its writer; holds a single null key when demultiplex is false.
private final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap;
private final int maxReadsInRamPerTile;
private final boolean demultiplex;
private final List<File> tmpDirs;
private final IlluminaDataProviderFactory factory;
// Must be supplied via setConverter() before doTileProcessing() runs.
private ClusterDataConverter<CLUSTER_OUTPUT_RECORD> converter = null;
private final ProgressLogger readProgressLogger = new ProgressLogger(log, 1000000, "Read");
private final ProgressLogger writeProgressLogger = new ProgressLogger(log, 1000000, "Write");
private int numThreads;
// If FORCE_GC, this is non-null. For production this is not necessary because it will run until the JVM
// ends, but for unit testing it is desirable to stop the task when done with this instance.
private final TimerTask gcTimerTask;
// Tiles to process, sorted by TILE_NUMBER_COMPARATOR and possibly trimmed by firstTile/tileLimit.
private List<Integer> tiles;
private final boolean includeNonPfReads;
private final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype;
// Annoying that we need this.
private final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass;
/**
 * Convenience constructor that reads barcodes from the basecalls directory. Delegates to the main
 * constructor with a null barcodesDir.
 *
 * @param basecallsDir           Where to read basecalls from.
 * @param lane                   What lane to process.
 * @param readStructure          How to interpret each cluster.
 * @param barcodeRecordWriterMap Map from barcode to CLUSTER_OUTPUT_RECORD writer. If demultiplex is false, must contain
 *                               one writer stored with key=null.
 * @param demultiplex            If true, output is split by barcode, otherwise all are written to the same output stream.
 * @param maxReadsInRamPerTile   Configures number of reads each tile will store in RAM before spilling to disk.
 * @param tmpDirs                For SortingCollection spilling.
 * @param numProcessors          Controls number of threads. If &lt;= 0, the number of threads allocated is
 *                               available cores - numProcessors.
 * @param forceGc                Force explicit GC periodically. This is good for causing memory maps to be released.
 * @param firstTile              (For debugging) If non-null, start processing at this tile.
 * @param tileLimit              (For debugging) If non-null, process no more than this many tiles.
 * @param outputRecordComparator For sorting output records within a single tile.
 * @param codecPrototype         For spilling output records to disk.
 * @param outputRecordClass      Inconveniently needed to create SortingCollections.
 * @param includeNonPfReads      If true, will include ALL reads (including those which do not have PF set)
 */
public IlluminaBasecallsConverter(final File basecallsDir, final int lane, final ReadStructure readStructure,
                                  final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap,
                                  final boolean demultiplex,
                                  final int maxReadsInRamPerTile,
                                  final List<File> tmpDirs,
                                  final int numProcessors, final boolean forceGc,
                                  final Integer firstTile, final Integer tileLimit,
                                  final Comparator<CLUSTER_OUTPUT_RECORD> outputRecordComparator,
                                  final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype,
                                  final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
                                  final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
                                  final boolean applyEamssFiltering,
                                  final boolean includeNonPfReads
) {
    this(basecallsDir, null, lane, readStructure,
            barcodeRecordWriterMap, demultiplex, maxReadsInRamPerTile,
            tmpDirs, numProcessors, forceGc, firstTile, tileLimit,
            outputRecordComparator, codecPrototype, outputRecordClass,
            bclQualityEvaluationStrategy, applyEamssFiltering,
            includeNonPfReads);
}
/**
* @param basecallsDir Where to read basecalls from.
* @param barcodesDir Where to read barcodes from (optional; use basecallsDir if not specified).
* @param lane What lane to process.
* @param readStructure How to interpret each cluster.
* @param barcodeRecordWriterMap Map from barcode to CLUSTER_OUTPUT_RECORD writer. If demultiplex is false, must contain
* one writer stored with key=null.
* @param demultiplex If true, output is split by barcode, otherwise all are written to the same output stream.
* @param maxReadsInRamPerTile Configures number of reads each tile will store in RAM before spilling to disk.
* @param tmpDirs For SortingCollection spilling.
* @param numProcessors Controls number of threads. If <= 0, the number of threads allocated is
* available cores - numProcessors.
* @param forceGc Force explicit GC periodically. This is good for causing memory maps to be released.
* @param firstTile (For debugging) If non-null, start processing at this tile.
* @param tileLimit (For debugging) If non-null, process no more than this many tiles.
* @param outputRecordComparator For sorting output records within a single tile.
* @param codecPrototype For spilling output records to disk.
* @param outputRecordClass Inconveniently needed to create SortingCollections.
* @param includeNonPfReads If true, will include ALL reads (including those which do not have PF set)
*/
public IlluminaBasecallsConverter(final File basecallsDir, File barcodesDir, final int lane,
final ReadStructure readStructure,
final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap,
final boolean demultiplex,
final int maxReadsInRamPerTile,
final List<File> tmpDirs, final int numProcessors,
final boolean forceGc, final Integer firstTile,
final Integer tileLimit,
final Comparator<CLUSTER_OUTPUT_RECORD> outputRecordComparator,
final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype,
final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
final boolean applyEamssFiltering, final boolean includeNonPfReads
) {
this.barcodeRecordWriterMap = barcodeRecordWriterMap;
this.demultiplex = demultiplex;
this.maxReadsInRamPerTile = maxReadsInRamPerTile;
this.tmpDirs = tmpDirs;
this.outputRecordComparator = outputRecordComparator;
this.codecPrototype = codecPrototype;
this.outputRecordClass = outputRecordClass;
this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;
this.includeNonPfReads = includeNonPfReads;
// If we're forcing garbage collection, collect every 5 minutes in a daemon thread.
if (forceGc) {
final Timer gcTimer = new Timer(true);
final long delay = 5 * 1000 * 60;
gcTimerTask = new TimerTask() {
@Override
public void run() {
log.info("Before explicit GC, Runtime.totalMemory()=" + Runtime.getRuntime().totalMemory());
System.gc();
System.runFinalization();
log.info("After explicit GC, Runtime.totalMemory()=" + Runtime.getRuntime().totalMemory());
}
};
gcTimer.scheduleAtFixedRate(gcTimerTask, delay, delay);
} else {
gcTimerTask = null;
}
this.factory = new IlluminaDataProviderFactory(basecallsDir, barcodesDir, lane, readStructure, bclQualityEvaluationStrategy, getDataTypesFromReadStructure(readStructure, demultiplex));
this.factory.setApplyEamssFiltering(applyEamssFiltering);
if (numProcessors == 0) {
this.numThreads = Runtime.getRuntime().availableProcessors();
} else if (numProcessors < 0) {
this.numThreads = Runtime.getRuntime().availableProcessors() + numProcessors;
} else {
this.numThreads = numProcessors;
}
this.tiles = new ArrayList<Integer>(factory.getAvailableTiles());
// Since the first non-fixed part of the read name is the tile number, without preceding zeroes,
// and the output is sorted by read name, process the tiles in this order.
Collections.sort(tiles, TILE_NUMBER_COMPARATOR);
if (firstTile != null) {
int i;
for (i = 0; i < tiles.size(); ++i) {
if (tiles.get(i).intValue() == firstTile.intValue()) {
tiles = tiles.subList(i, tiles.size());
break;
}
}
if (tiles.get(0).intValue() != firstTile.intValue()) {
throw new PicardException("firstTile=" + firstTile + ", but that tile was not found.");
}
}
if (tileLimit != null && tiles.size() > tileLimit) {
tiles = tiles.subList(0, tileLimit);
}
this.numThreads = Math.max(1, Math.min(this.numThreads, tiles.size()));
}
/**
 * Must be called before doTileProcessing.  This is not passed in the ctor because often the
 * IlluminaDataProviderFactory is needed in order to construct the converter.
 *
 * @param converter Converts ClusterData to CLUSTER_OUTPUT_RECORD
 */
public void setConverter(final ClusterDataConverter<CLUSTER_OUTPUT_RECORD> converter) {
    this.converter = converter;
}
/**
 * In case caller needs to get some info from factory.
 *
 * @return the IlluminaDataProviderFactory constructed for this run/lane.
 */
public IlluminaDataProviderFactory getFactory() {
    return factory;
}
/**
 * Do the work, i.e. create a bunch of threads to read, sort and write.
 * setConverter() must be called before calling this method.
 * <p/>
 * Always cancels the optional GC timer task and closes every registered writer on exit.
 *
 * @throws PicardException If a worker thread fails (the original InterruptedException is preserved as the cause).
 */
public void doTileProcessing() {
    try {
        // TODO: Eliminate this when switch to JDK 7
        FileChannelJDKBugWorkAround.doBugWorkAround();

        // Generate the list of tiles that will be processed
        final List<Tile> tiles = new ArrayList<Tile>();
        for (final Integer tileNumber : this.tiles) {
            tiles.add(new Tile(tileNumber));
        }

        final TileReadAggregator tileReadAggregator = new TileReadAggregator(tiles);
        tileReadAggregator.submit();
        try {
            tileReadAggregator.awaitWorkComplete();
        } catch (final InterruptedException e) {
            // Restore the interrupt status so callers further up the stack can still observe it.
            Thread.currentThread().interrupt();
            log.error(e, "Failure encountered in worker thread; attempting to shut down remaining worker threads and terminate ...");
            // Chain the original exception rather than discarding it.
            throw new PicardException("Failure encountered in worker thread; see log for details.", e);
        } finally {
            tileReadAggregator.shutdown();
        }

        // Report any low-quality basecalls observed during the run.
        for (final Map.Entry<Byte, Integer> entry : bclQualityEvaluationStrategy.getPoorQualityFrequencies().entrySet()) {
            log.warn(String.format("Observed low quality of %s %s times.", entry.getKey(), entry.getValue()));
        }
        bclQualityEvaluationStrategy.assertMinimumQualities();
    } finally {
        try {
            if (gcTimerTask != null) gcTimerTask.cancel();
        } catch (final Throwable ex) {
            log.warn(ex, "Ignoring exception stopping background GC thread.");
        }
        // Close the writers
        for (final Map.Entry<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> entry : barcodeRecordWriterMap.entrySet()) {
            final ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD> writer = entry.getValue();
            log.debug(String.format("Closing file for barcode %s.", entry.getKey()));
            writer.close();
        }
    }
}
/**
 * Simple representation of a tile. Equality, hashing and ordering are all based on the tile number;
 * ordering uses TILE_NUMBER_COMPARATOR so compareTo is consistent with equals.
 */
private static class Tile implements Comparable<Tile> {
    private final int tileNumber;

    public Tile(final int i) {
        tileNumber = i;
    }

    public int getNumber() {
        return tileNumber;
    }

    @Override
    public boolean equals(final Object o) {
        return o instanceof Tile && this.getNumber() == ((Tile) o).getNumber();
    }

    // equals() was overridden without hashCode(), violating the Object contract; Tiles would
    // misbehave in any hash-based collection. Hash on the same field equals compares.
    @Override
    public int hashCode() {
        return tileNumber;
    }

    @Override
    public int compareTo(final Tile o) {
        return TILE_NUMBER_COMPARATOR.compare(this.getNumber(), o.getNumber());
    }
}
/**
 * A Runnable that carries a priority which is used to compare and order other PriorityRunnables in a task queue.
 */
private abstract class PriorityRunnable implements Runnable {
    private final int taskPriority;

    /**
     * Create a new priority runnable with a default priority of 1.
     */
    public PriorityRunnable() {
        this(1);
    }

    /**
     * Create a new runnable with the given priority.
     */
    public PriorityRunnable(final int priority) {
        this.taskPriority = priority;
    }

    /**
     * Returns the priority level. Higher priorities are run earlier.
     *
     * @return this task's priority level
     */
    int getPriority() {
        return this.taskPriority;
    }
}
/**
 * Represents the state of a tile's processing and encapsulates the data collected from that tile.
 * <p/>
 * TileProcessingRecords are accessed from each worker thread to assess the progress of the run, so its methods
 * are synchronized. Note that the maps returned by the getters are the live internal maps (not copies);
 * TileReadAggregator relies on mutating them directly.
 */
private class TileProcessingRecord {
    // Spill-to-disk record collections, one per barcode observed in this tile.
    final private Map<String, SortingCollection<CLUSTER_OUTPUT_RECORD>> barcodeToRecordCollection =
            new HashMap<String, SortingCollection<CLUSTER_OUTPUT_RECORD>>();
    // Write-progress state per barcode; an entry is created with a null value when the barcode is
    // first seen and is set to READ by TileReadAggregator.completeTile().
    final private Map<String, TileBarcodeProcessingState> barcodeToProcessingState = new HashMap<String, TileBarcodeProcessingState>();
    // Whether all clusters for this tile have been read.
    private TileProcessingState state = TileProcessingState.NOT_DONE_READING;
    // Total number of records added across all barcodes.
    private long recordCount = 0;

    /**
     * Returns the state of this tile's processing.
     */
    public synchronized TileProcessingState getState() {
        return this.state;
    }

    /**
     * Sets the state of this tile's processing.
     */
    public synchronized void setState(final TileProcessingState state) {
        this.state = state;
    }

    /**
     * Adds the provided record to this tile, lazily creating the barcode's sorting collection.
     *
     * @throws PicardException If the barcode has no registered writer in barcodeRecordWriterMap.
     */
    public synchronized void addRecord(final String barcode, final CLUSTER_OUTPUT_RECORD record) {
        this.recordCount += 1;

        // Grab the existing collection, or initialize it if it doesn't yet exist
        SortingCollection<CLUSTER_OUTPUT_RECORD> recordCollection = this.barcodeToRecordCollection.get(barcode);
        if (recordCollection == null) {
            // Fail fast when a record arrives for a barcode that has no destination writer.
            if (!barcodeRecordWriterMap.containsKey(barcode))
                throw new PicardException(String.format("Read records with barcode %s, but this barcode was not expected. (Is it referenced in the parameters file?)", barcode));
            recordCollection = this.newSortingCollection();
            this.barcodeToRecordCollection.put(barcode, recordCollection);
            this.barcodeToProcessingState.put(barcode, null);
        }
        recordCollection.add(record);
    }

    // Builds a SortingCollection whose in-RAM budget is the per-tile budget split evenly
    // across all possible barcodes.
    private synchronized SortingCollection<CLUSTER_OUTPUT_RECORD> newSortingCollection() {
        final int maxRecordsInRam =
                maxReadsInRamPerTile /
                        barcodeRecordWriterMap.size();
        return SortingCollection.newInstance(
                outputRecordClass,
                codecPrototype.clone(),
                outputRecordComparator,
                maxRecordsInRam,
                tmpDirs);
    }

    /**
     * Returns the number of unique barcodes read.
     */
    public synchronized long getBarcodeCount() {
        return this.barcodeToRecordCollection.size();
    }

    /**
     * Returns the number of records read.
     */
    public synchronized long getRecordCount() {
        return recordCount;
    }

    /**
     * Returns the mapping of barcodes to records associated with them (the live internal map).
     */
    public synchronized Map<String, SortingCollection<CLUSTER_OUTPUT_RECORD>> getBarcodeRecords() {
        return barcodeToRecordCollection;
    }

    /**
     * Gets the state of the provided barcode's data's processing progress. Only invoke this query if this tile
     * is in a DONE_READING state.
     *
     * @throws IllegalStateException When a barcode is queried before the tile is in the DONE_READING state
     */
    public synchronized TileBarcodeProcessingState getBarcodeState(final String barcode) {
        if (this.getState() == TileProcessingState.NOT_DONE_READING) {
            throw new IllegalStateException(
                    "A tile's barcode data's state cannot be queried until the tile has been completely read.");
        }

        if (this.barcodeToProcessingState.containsKey(barcode)) {
            return this.barcodeToProcessingState.get(barcode);
        } else {
            // No data was collected for this barcode in this tile.
            return TileBarcodeProcessingState.NA;
        }
    }

    /**
     * Returns the live map of each barcode to its processing state.
     */
    public synchronized Map<String, TileBarcodeProcessingState> getBarcodeProcessingStates() {
        return this.barcodeToProcessingState;
    }

    /**
     * Sets the processing state of the provided barcode in this record.
     *
     * @throws java.util.NoSuchElementException When the provided barcode is not one associated with this record.
     */
    public synchronized void setBarcodeState(final String barcode, final TileBarcodeProcessingState state) {
        if (this.barcodeToProcessingState.containsKey(barcode)) {
            this.barcodeToProcessingState.put(barcode, state);
        } else {
            throw new NoSuchElementException(String.format("No record of the provided barcode, %s.", barcode));
        }
    }

    /**
     * Returns the distinct set of barcodes for which data has been collected in this record.
     *
     * @return the set of barcodes observed in this tile
     */
    public synchronized Set<String> getBarcodes() {
        return this.getBarcodeRecords().keySet();
    }
}
/**
 * Reads the information from a tile via an IlluminaDataProvider and feeds read information into a
 * processingRecord managed by the TileReadAggregator.
 */
private class TileReader {
    private final Tile tile;
    private final TileReadAggregator handler;
    private final TileProcessingRecord processingRecord;

    public TileReader(final Tile tile, final TileReadAggregator handler, final TileProcessingRecord processingRecord) {
        this.tile = tile;
        this.handler = handler;
        this.processingRecord = processingRecord;
    }

    /**
     * Reads the data from the appropriate IlluminaDataProvider and feeds it into the TileProcessingRecord for
     * this tile. The data provider is closed even if reading fails (previously an exception mid-read
     * leaked the provider and its open file handles/memory maps).
     */
    public void process() {
        final IlluminaDataProvider dataProvider = factory.makeDataProvider(Arrays.asList(this.tile.getNumber()));
        try {
            log.debug(String.format("Reading data from tile %s ...", tile.getNumber()));

            while (dataProvider.hasNext()) {
                final ClusterData cluster = dataProvider.next();
                readProgressLogger.record(null, 0);
                // If this cluster is passing, or we do NOT want to ONLY emit passing reads, then add it to the next
                if (cluster.isPf() || includeNonPfReads) {
                    final String barcode = (demultiplex ? cluster.getMatchedBarcode() : null);
                    this.processingRecord.addRecord(barcode, converter.convertClusterToOutputRecord(cluster));
                }
            }

            this.handler.completeTile(this.tile);
        } finally {
            dataProvider.close();
        }
    }
}
/**
 * Aggregates data collected from tiles and writes them to file. Accepts records from TileReaders and maps
 * them to the appropriate BAM writers.
 */
private class TileReadAggregator {
    /**
     * The collection of records associated with a particular tile.
     * <p/>
     * Implemented as a TreeMap to guarantee tiles are iterated over in natural order.
     */
    private final Map<Tile, TileProcessingRecord> tileRecords = new TreeMap<Tile, TileProcessingRecord>();

    /**
     * The executor responsible for doing work.
     * <p/>
     * Implemented as a ThreadPoolExecutor with a PriorityBlockingQueue which orders submitted Runnables by their
     * priority.
     */
    private final ExecutorService prioritizingThreadPool = new ThreadPoolExecutor(
            numThreads,
            numThreads,
            0L,
            MILLISECONDS,
            new PriorityBlockingQueue<Runnable>(5, new Comparator<Runnable>() {
                @Override
                /**
                 * Compare the two Runnables, and assume they are PriorityRunnable; if not something strange is
                 * going on, so allow a ClassCastException be thrown.
                 */
                public int compare(final Runnable o1, final Runnable o2) {
                    // Higher priority items go earlier in the queue, so reverse the "natural" comparison.
                    // Integer.compare avoids the overflow possible with plain subtraction of priorities.
                    return Integer.compare(((PriorityRunnable) o2).getPriority(), ((PriorityRunnable) o1).getPriority());
                }
            })
    );

    /**
     * The object acting as a latch to notify when the aggregator completes its work.
     */
    private final Object completionLatch = new Object();

    /**
     * Completion flag, guarded by completionLatch. Waiting on the latch without a condition flag
     * loses a notifyAll() that fires before the waiter reaches wait() (hanging forever) and is
     * also vulnerable to spurious wakeups; this flag fixes both.
     */
    private boolean allWorkDone = false;

    /**
     * Stores the thread that is executing this work so that it can be interrupted upon failure.
     */
    private Thread parentThread;
    private final Object workEnqueueMonitor = new Object();
    private final AtomicBoolean submitted = new AtomicBoolean(false);

    /**
     * Creates a TileReadAggregator that reads from the provided tiles.
     *
     * @param tiles The tiles whose data should be read, aggregated and written out
     */
    public TileReadAggregator(final Collection<Tile> tiles) {
        for (final Tile t : tiles) {
            tileRecords.put(t, new TileProcessingRecord());
        }
    }

    /**
     * Execute the tile aggregator's work. Creates a thread pool to read data from tiles and write them to file.
     * Invoke this method only once.
     *
     * @throws IllegalStateException If submit was called more than once.
     */
    public void submit() {
        // Ensure the aggregator has not yet been submitted
        if (!this.submitted.compareAndSet(false, true)) {
            throw new IllegalStateException("The submit() method may not be called more than once.");
        }

        // Set the thread that is executing this work
        this.parentThread = Thread.currentThread();

        /*
         * For each tile, create and submit a tile processor. Give it a negative execution priority (so that
         * prioritized tasks with a positive execution priority execute first), and give later tiles a lesser
         * (more negative) priority.
         */
        int priority = 0;
        for (final Tile tile : this.tileRecords.keySet()) {
            final TileReader reader = new TileReader(tile, this, this.tileRecords.get(tile));
            this.prioritizingThreadPool.execute(new PriorityRunnable(--priority) {
                @Override
                public void run() {
                    try {
                        reader.process();
                    } catch (final RuntimeException e) {
                        /*
                         * In the event of an internal failure, signal to the parent thread that something has gone
                         * wrong. This is necessary because if an item of work fails to complete, the aggregator
                         * will never reach its completed state, and it will never terminate.
                         */
                        parentThread.interrupt();
                        throw e;
                    } catch (final Error e) {
                        parentThread.interrupt();
                        throw e;
                    }
                }
            });
        }
    }

    /**
     * Signals that a tile's processing is complete. This must be invoked exactly once per tile, and only after
     * all of that tile has been processed.
     *
     * @throws IllegalStateException When the tile is already in the completed state.
     */
    private void completeTile(final Tile tile) {
        final TileProcessingRecord tileRecord = this.tileRecords.get(tile);
        if (tileRecord.getState() == TileProcessingState.DONE_READING) {
            throw new IllegalStateException("This tile is already in the completed state.");
        }

        // Update all of the barcodes and the tile to be marked as read
        for (final String barcode : tileRecord.getBarcodes()) {
            tileRecord.setBarcodeState(barcode, TileBarcodeProcessingState.READ);
            // No more records will be added for this tile; finalize the sorting collection.
            tileRecord.barcodeToRecordCollection.get(barcode).doneAdding();
        }
        tileRecord.setState(TileProcessingState.DONE_READING);

        log.debug(String.format("Completed reading tile %s; collected %s reads spanning %s barcodes.",
                tile.getNumber(), tileRecord.getRecordCount(), tileRecord.getBarcodeCount()));

        this.findAndEnqueueWorkOrSignalCompletion();
    }

    /**
     * Blocks until this aggregator completes its work. Returns immediately if the work was already
     * completed before this method was invoked.
     *
     * @throws InterruptedException If the waiting thread is interrupted.
     */
    public void awaitWorkComplete() throws InterruptedException {
        synchronized (this.completionLatch) {
            // Wait in a loop on the condition flag: this tolerates spurious wakeups and the race
            // in which all work finishes (and notifies) before this method starts waiting.
            while (!this.allWorkDone) {
                this.completionLatch.wait();
            }
        }
    }

    /**
     * Signals to any thread awaiting via awaitWorkComplete() that no work remains. Called
     * when this aggregator has reached its completed state.
     */
    private void signalWorkComplete() {
        synchronized (this.completionLatch) {
            this.allWorkDone = true;
            this.completionLatch.notifyAll();
        }
    }

    /**
     * Poll the aggregator to find more tasks for it to enqueue. Specifically, searches for un-written data
     * read from tiles for each barcode and enqueues it for writing.
     */
    private void findAndEnqueueWorkOrSignalCompletion() {
        synchronized (this.workEnqueueMonitor) {
            /*
             * If there is work remaining to be done in this aggregator, walk through all of the barcodes and find
             * tiles which have not yet written their barcode data but are in a state where they are able to.
             */
            if (this.isWorkCompleted()) {
                this.signalWorkComplete();
            } else {
                final Queue<Runnable> tasks = new LinkedList<Runnable>();
                for (final String barcode : barcodeRecordWriterMap.keySet()) {
                    NEXT_BARCODE:
                    for (final Map.Entry<Tile, TileProcessingRecord> entry : this.tileRecords.entrySet()) {
                        final Tile tile = entry.getKey();
                        final TileProcessingRecord tileRecord = entry.getValue();

                        /*
                         * If this tile has not been read, we cannot write this or later tiles' barcode data;
                         * move to the next barcode.
                         */
                        if (tileRecord.getState() != TileProcessingState.DONE_READING) {
                            break;
                        }
                        switch (tileRecord.getBarcodeState(barcode)) {
                            case NA:
                            case WRITTEN:
                                /*
                                 * There is no data for this barcode for this tile, or it is already written; in
                                 * either scenario, this barcode will not be processed further for this tile, so
                                 * move onto the next tile as a possible candidate.
                                 */
                                continue;
                            case QUEUED_FOR_WRITE:
                                /*
                                 * The write for this barcode is in progress for this tile, so skip to the next
                                 * barcode.
                                 */
                                break NEXT_BARCODE;
                            case READ:
                                /*
                                 * This barcode has been read, and all of the earlier tiles have been written
                                 * for this barcode, so queue its writing.
                                 */
                                tileRecord.setBarcodeState(barcode, TileBarcodeProcessingState.QUEUED_FOR_WRITE);
                                log.debug(String.format("Enqueuing work for tile %s and barcode %s.", tile.getNumber(), barcode));
                                tasks.add(this.newBarcodeWorkInstance(tile, tileRecord, barcode));
                                break NEXT_BARCODE;
                        }
                    }
                }

                for (final Runnable task : tasks) {
                    this.prioritizingThreadPool.execute(task);
                }
            }
        }
    }

    /**
     * Returns a PriorityRunnable that encapsulates the work involved with writing the provided tileRecord's data
     * for the given barcode to disk.
     *
     * @param tile       The tile from which the record was read
     * @param tileRecord The processing record associated with the tile
     * @param barcode    The barcode whose data within the tileRecord is to be written
     * @return The runnable that upon invocation writes the barcode's data from the tileRecord to disk
     */
    private PriorityRunnable newBarcodeWorkInstance(final Tile tile, final TileProcessingRecord tileRecord, final String barcode) {
        return new PriorityRunnable() {
            @Override
            public void run() {
                try {
                    final SortingCollection<CLUSTER_OUTPUT_RECORD> records = tileRecord.getBarcodeRecords().get(barcode);
                    final ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD> writer = barcodeRecordWriterMap.get(barcode);

                    log.debug(String.format("Writing records from tile %s with barcode %s ...", tile.getNumber(), barcode));

                    final PeekIterator<CLUSTER_OUTPUT_RECORD> it = new PeekIterator<CLUSTER_OUTPUT_RECORD>(records.iterator());
                    while (it.hasNext()) {
                        final CLUSTER_OUTPUT_RECORD rec = it.next();

                        /*
                         * PIC-330 Sometimes there are two reads with the same cluster coordinates, and thus
                         * the same read name. Discard both of them. This code assumes that the two first of pairs
                         * will come before the two second of pairs, so it isn't necessary to look ahead a different
                         * distance for paired end. It also assumes that for paired ends there will be duplicates
                         * for both ends, so there is no need to be PE-aware.
                         */
                        if (it.hasNext()) {
                            final CLUSTER_OUTPUT_RECORD lookAhead = it.peek();

                            /* TODO: Put this in SAMFileWriter wrapper
                            if (!rec.getReadUnmappedFlag() || !lookAhead.getReadUnmappedFlag()) {
                                throw new IllegalStateException("Should not have mapped reads.");
                            }
                            */

                            if (outputRecordComparator.compare(rec, lookAhead) == 0) {
                                it.next();
                                log.info("Skipping reads with identical read names: " + rec.toString());
                                continue;
                            }
                        }

                        writer.write(rec);
                        writeProgressLogger.record(null, 0);
                    }
                    tileRecord.setBarcodeState(barcode, TileBarcodeProcessingState.WRITTEN);
                    findAndEnqueueWorkOrSignalCompletion();
                } catch (final RuntimeException e) {
                    /*
                     * In the event of an internal failure, signal to the parent thread that something has gone
                     * wrong. This is necessary because if an item of work fails to complete, the aggregator
                     * will never reach its completed state, and it will never terminate.
                     */
                    parentThread.interrupt();
                    throw e;
                } catch (final Error e) {
                    parentThread.interrupt();
                    throw e;
                }
            }
        };
    }

    /**
     * Returns true if this aggregator has completed its work. Specifically, returns false iff
     * any tile's barcode data has not yet been written.
     *
     * @return False if more work remains to be done, true otherwise
     */
    public boolean isWorkCompleted() {
        for (final Map.Entry<Tile, TileProcessingRecord> entry : this.tileRecords.entrySet()) {
            final TileProcessingRecord tileProcessingRecord = entry.getValue();

            if (tileProcessingRecord.getState() != TileProcessingState.DONE_READING) {
                log.debug(String.format("Work is not completed because a tile isn't done being read: %s.", entry.getKey().getNumber()));
                return false;
            } else {
                for (final Map.Entry<String, TileBarcodeProcessingState> barcodeStateEntry : tileProcessingRecord.getBarcodeProcessingStates().entrySet()) {
                    final TileBarcodeProcessingState barcodeProcessingState = barcodeStateEntry.getValue();
                    if (barcodeProcessingState != TileBarcodeProcessingState.WRITTEN) {
                        log.debug(String.format("Work is not completed because a tile isn't done being read: Tile %s, Barcode %s, Processing State %s.", entry.getKey().getNumber(), barcodeStateEntry.getKey(), barcodeProcessingState));
                        return false;
                    }
                }
            }
        }
        log.info("All work is complete.");
        return true;
    }

    /**
     * Terminates the threads currently existing in the thread pool abruptly via ThreadPoolExecutor.shutdownNow().
     */
    public void shutdown() {
        this.prioritizingThreadPool.shutdownNow();
    }
}
/**
 * Given a read structure return the data types that need to be parsed for this run
 */
private static IlluminaDataType[] getDataTypesFromReadStructure(final ReadStructure readStructure,
                                                                final boolean demultiplex) {
    // Barcode data is only required when demultiplexing a run that actually contains barcode reads.
    final boolean barcodesNeeded = demultiplex && !readStructure.barcodes.isEmpty();
    return barcodesNeeded ? DATA_TYPES_WITH_BARCODE : DATA_TYPES_NO_BARCODE;
}
public static interface ClusterDataConverter<OUTPUT_RECORD> {
/**
* Creates the OUTPUT_RECORDs from the cluster
*/
public OUTPUT_RECORD convertClusterToOutputRecord(final ClusterData cluster);
}
/**
 * Destination for converted records; one writer exists per output barcode (or a single
 * null-keyed writer when not demultiplexing). (The redundant 'static' modifier on this
 * nested interface has been dropped.)
 */
public interface ConvertedClusterDataWriter<OUTPUT_RECORD> {
    /** Writes a single converted record to the output. */
    void write(final OUTPUT_RECORD rec);

    /** Flushes and closes the underlying output. */
    void close();
}
}