Package picard.illumina

Source Code of picard.illumina.IlluminaBasecallsConverter$ClusterDataConverter

/*
* The MIT License
*
* Copyright (c) 2013 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.illumina;

import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SortingCollection;
import picard.PicardException;
import picard.illumina.parser.ClusterData;
import picard.illumina.parser.IlluminaDataProvider;
import picard.illumina.parser.IlluminaDataProviderFactory;
import picard.illumina.parser.IlluminaDataType;
import picard.illumina.parser.ReadStructure;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import picard.util.FileChannelJDKBugWorkAround;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.atomic.AtomicBoolean;

import static java.util.concurrent.TimeUnit.MILLISECONDS;

/**
* Manages the conversion of Illumina basecalls into some output format.  Creates multiple threads to manage reading,
* sorting and writing efficiently.  Output is written in queryname order.  Optionally demultiplexes indexed reads
* into separate outputs by barcode.
*
* @param <CLUSTER_OUTPUT_RECORD> The class to which a ClusterData is converted in preparation for writing.
*/
public class IlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> {
    /**
     * Describes the state of a barcode's data's processing in the context of a tile.  It is either not available in
     * that tile, has been read, has been queued to be written to file, or has been written to file.  A barcode only
     * takes on a state once the tile (which is serving as the context of this state) has been read.
     * <p/>
     * NA is what TileProcessingRecord.getBarcodeState() reports for a barcode the tile holds no records for, and
     * READ is assigned to every barcode of a tile by TileReadAggregator.completeTile().
     */
    private enum TileBarcodeProcessingState {
        NA, READ, QUEUED_FOR_WRITE, WRITTEN
    }

    /**
     * Describes the state of a tile being processed.  It is either not yet completely read, or read.
     * <p/>
     * A TileProcessingRecord starts out as NOT_DONE_READING and is switched to DONE_READING by
     * TileReadAggregator.completeTile(), which a TileReader calls after it has consumed every cluster of the tile.
     */
    private enum TileProcessingState {
        NOT_DONE_READING, DONE_READING
    }

    // Logger shared by the converter and its nested helper classes.
    private static final Log log = Log.getInstance(IlluminaBasecallsConverter.class);

    // The data types listed when barcodes are not part of the requested data.
    public static final IlluminaDataType[] DATA_TYPES_NO_BARCODE =
            {IlluminaDataType.BaseCalls, IlluminaDataType.QualityScores, IlluminaDataType.Position, IlluminaDataType.PF};
    // Same entries as DATA_TYPES_NO_BARCODE plus one trailing slot, which the static initializer below fills in.
    private static final IlluminaDataType[] DATA_TYPES_WITH_BARCODE = Arrays.copyOf(DATA_TYPES_NO_BARCODE, DATA_TYPES_NO_BARCODE.length + 1);

    // Arrays.copyOf pads the extra last element with null; replace it with the Barcodes data type.
    static {
        DATA_TYPES_WITH_BARCODE[DATA_TYPES_WITH_BARCODE.length - 1] = IlluminaDataType.Barcodes;
    }

    /**
     * A comparator for tile numbers, which are not necessarily ordered by the number's value.
     */
    public static final Comparator<Integer> TILE_NUMBER_COMPARATOR = new Comparator<Integer>() {
        @Override
        public int compare(final Integer integer1, final Integer integer2) {
            final String s1 = integer1.toString();
            final String s2 = integer2.toString();
            // Because a the tile number is followed by a colon, a tile number that
            // is a prefix of another tile number should sort after. (e.g. 10 sorts after 100).
            if (s1.length() < s2.length()) {
                if (s2.startsWith(s1)) {
                    return 1;
                }
            } else if (s2.length() < s1.length()) {
                if (s1.startsWith(s2)) {
                    return -1;
                }
            }
            return s1.compareTo(s2);
        }
    };

    // Sort order handed to every SortingCollection created for a tile/barcode pair.
    private final Comparator<CLUSTER_OUTPUT_RECORD> outputRecordComparator;

    // Passed to the data provider factory; its poor-quality statistics are reported at the end of doTileProcessing().
    private final BclQualityEvaluationStrategy bclQualityEvaluationStrategy;
    // Writers keyed by barcode.  Every writer in this map is closed at the end of doTileProcessing().
    private final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap;
    // Divided by the number of writers to obtain the in-RAM limit of each SortingCollection.
    private final int maxReadsInRamPerTile;
    // When false, every record is filed under the null barcode key.
    private final boolean demultiplex;
    // Temporary directories handed to each SortingCollection.
    private final List<File> tmpDirs;
    // Creates one IlluminaDataProvider per tile.
    private final IlluminaDataProviderFactory factory;
    // Stays null until setConverter() is called.
    private ClusterDataConverter<CLUSTER_OUTPUT_RECORD> converter = null;
    // Ticked once for every cluster a TileReader pulls from its data provider.
    private final ProgressLogger readProgressLogger = new ProgressLogger(log, 1000000, "Read");
    private final ProgressLogger writeProgressLogger = new ProgressLogger(log, 1000000, "Write");
    // Size of the worker thread pool; computed in the constructor from numProcessors and the tile count.
    private int numThreads;
    // If FORCE_GC, this is non-null.  For production this is not necessary because it will run until the JVM
    // ends, but for unit testing it is desirable to stop the task when done with this instance.
    private final TimerTask gcTimerTask;
    // Tile numbers to process, sorted with TILE_NUMBER_COMPARATOR and possibly trimmed by firstTile/tileLimit.
    private List<Integer> tiles;
    // When true, clusters that do not pass filter are converted as well.
    private final boolean includeNonPfReads;
    // Cloned for each new SortingCollection.
    private final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype;
    // Annoying that we need this.
    private final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass;

    /**
     * Convenience constructor for callers without a separate barcodes directory: delegates to the full
     * constructor, passing null as barcodesDir and every other argument unchanged.
     *
     * @param basecallsDir           Where to read basecalls from.
     * @param lane                   What lane to process.
     * @param readStructure          How to interpret each cluster.
     * @param barcodeRecordWriterMap Map from barcode to CLUSTER_OUTPUT_RECORD writer.  If demultiplex is false, must contain
     *                               one writer stored with key=null.
     * @param demultiplex            If true, output is split by barcode, otherwise all are written to the same output stream.
     * @param maxReadsInRamPerTile   Configures number of reads each tile will store in RAM before spilling to disk.
     * @param tmpDirs                For SortingCollection spilling.
     * @param numProcessors          Controls number of threads.  If 0, one thread per available core is used; if
     *                               negative, the number of threads is available cores + numProcessors (i.e. that
     *                               many cores are left free); otherwise exactly this many threads are requested.
     * @param forceGc                Force explicit GC periodically.  This is good for causing memory maps to be released.
     * @param firstTile              (For debugging) If non-null, start processing at this tile.
     * @param tileLimit              (For debugging) If non-null, process no more than this many tiles.
     * @param outputRecordComparator For sorting output records within a single tile.
     * @param codecPrototype         For spilling output records to disk.
     * @param outputRecordClass      Inconveniently needed to create SortingCollections.
     * @param bclQualityEvaluationStrategy Forwarded to the full constructor, which hands it to the
     *                               IlluminaDataProviderFactory.
     * @param applyEamssFiltering    Forwarded to the full constructor, which passes it to
     *                               IlluminaDataProviderFactory.setApplyEamssFiltering().
     * @param includeNonPfReads      If true, will include ALL reads (including those which do not have PF set)
     */
  public IlluminaBasecallsConverter(final File basecallsDir, final int lane, final ReadStructure readStructure,
                                    final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap,
                                    final boolean demultiplex,
                                    final int maxReadsInRamPerTile,
                                    final List<File> tmpDirs,
                                    final int numProcessors, final boolean forceGc,
                                    final Integer firstTile, final Integer tileLimit,
                                    final Comparator<CLUSTER_OUTPUT_RECORD> outputRecordComparator,
                                    final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype,
                                    final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
                                    final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
                                    final boolean applyEamssFiltering,
                                    final boolean includeNonPfReads
  ) {
    // The second argument is barcodesDir; null means no separate barcodes directory was specified.
    this(basecallsDir, null, lane, readStructure,
        barcodeRecordWriterMap, demultiplex, maxReadsInRamPerTile,
        tmpDirs, numProcessors, forceGc, firstTile, tileLimit,
        outputRecordComparator, codecPrototype, outputRecordClass,
        bclQualityEvaluationStrategy, applyEamssFiltering,
        includeNonPfReads);
  }

  /**
     * @param basecallsDir           Where to read basecalls from.
     * @param barcodesDir            Where to read barcodes from (optional; use basecallsDir if not specified).
     * @param lane                   What lane to process.
     * @param readStructure          How to interpret each cluster.
     * @param barcodeRecordWriterMap Map from barcode to CLUSTER_OUTPUT_RECORD writer.  If demultiplex is false, must contain
     *                               one writer stored with key=null.
     * @param demultiplex            If true, output is split by barcode, otherwise all are written to the same output stream.
     * @param maxReadsInRamPerTile   Configures number of reads each tile will store in RAM before spilling to disk.
     * @param tmpDirs                For SortingCollection spilling.
     * @param numProcessors          Controls number of threads.  If <= 0, the number of threads allocated is
     *                               available cores - numProcessors.
     * @param forceGc                Force explicit GC periodically.  This is good for causing memory maps to be released.
     * @param firstTile              (For debugging) If non-null, start processing at this tile.
     * @param tileLimit              (For debugging) If non-null, process no more than this many tiles.
     * @param outputRecordComparator For sorting output records within a single tile.
     * @param codecPrototype         For spilling output records to disk.
     * @param outputRecordClass      Inconveniently needed to create SortingCollections.
     * @param includeNonPfReads      If true, will include ALL reads (including those which do not have PF set)
     */
    public IlluminaBasecallsConverter(final File basecallsDir, File barcodesDir, final int lane,
                                      final ReadStructure readStructure,
                                      final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap,
                                      final boolean demultiplex,
                                      final int maxReadsInRamPerTile,
                                      final List<File> tmpDirs, final int numProcessors,
                                      final boolean forceGc, final Integer firstTile,
                                      final Integer tileLimit,
                                      final Comparator<CLUSTER_OUTPUT_RECORD> outputRecordComparator,
                                      final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype,
                                      final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
                                      final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
                                      final boolean applyEamssFiltering, final boolean includeNonPfReads
    ) {
        this.barcodeRecordWriterMap = barcodeRecordWriterMap;
        this.demultiplex = demultiplex;
        this.maxReadsInRamPerTile = maxReadsInRamPerTile;
        this.tmpDirs = tmpDirs;
        this.outputRecordComparator = outputRecordComparator;
        this.codecPrototype = codecPrototype;
        this.outputRecordClass = outputRecordClass;
        this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;
        this.includeNonPfReads = includeNonPfReads;

        // If we're forcing garbage collection, collect every 5 minutes in a daemon thread.
        if (forceGc) {
            final Timer gcTimer = new Timer(true);
            final long delay = 5 * 1000 * 60;
            gcTimerTask = new TimerTask() {
                @Override
                public void run() {
                    log.info("Before explicit GC, Runtime.totalMemory()=" + Runtime.getRuntime().totalMemory());
                    System.gc();
                    System.runFinalization();
                    log.info("After explicit GC, Runtime.totalMemory()=" + Runtime.getRuntime().totalMemory());
                }
            };
            gcTimer.scheduleAtFixedRate(gcTimerTask, delay, delay);
        } else {
            gcTimerTask = null;
        }

        this.factory = new IlluminaDataProviderFactory(basecallsDir, barcodesDir, lane, readStructure, bclQualityEvaluationStrategy, getDataTypesFromReadStructure(readStructure, demultiplex));
        this.factory.setApplyEamssFiltering(applyEamssFiltering);

        if (numProcessors == 0) {
            this.numThreads = Runtime.getRuntime().availableProcessors();
        } else if (numProcessors < 0) {
            this.numThreads = Runtime.getRuntime().availableProcessors() + numProcessors;
        } else {
            this.numThreads = numProcessors;
        }
        this.tiles = new ArrayList<Integer>(factory.getAvailableTiles());
        // Since the first non-fixed part of the read name is the tile number, without preceding zeroes,
        // and the output is sorted by read name, process the tiles in this order.
        Collections.sort(tiles, TILE_NUMBER_COMPARATOR);
        if (firstTile != null) {
            int i;
            for (i = 0; i < tiles.size(); ++i) {
                if (tiles.get(i).intValue() == firstTile.intValue()) {
                    tiles = tiles.subList(i, tiles.size());
                    break;
                }
            }
            if (tiles.get(0).intValue() != firstTile.intValue()) {
                throw new PicardException("firstTile=" + firstTile + ", but that tile was not found.");
            }
        }
        if (tileLimit != null && tiles.size() > tileLimit) {
            tiles = tiles.subList(0, tileLimit);
        }

        this.numThreads = Math.max(1, Math.min(this.numThreads, tiles.size()));
    }

    /**
     * Must be called before doTileProcessing.  This is not passed in the ctor because often the
     * IlluminaDataProviderFactory is needed in order to construct the converter.
     * <p/>
     * Until this is called the converter field is null; TileReader.process() invokes the converter for every
     * cluster it keeps.
     *
     * @param converter Converts ClusterData to CLUSTER_OUTPUT_RECORD
     */
    public void setConverter(final ClusterDataConverter<CLUSTER_OUTPUT_RECORD> converter) {
        this.converter = converter;
    }

    /**
     * In case caller needs to get some info from factory.
     *
     * @return The IlluminaDataProviderFactory created by the constructor (the same instance, not a copy).
     */
    public IlluminaDataProviderFactory getFactory() {
        return factory;
    }

    /**
     * Do the work, i.e. create a bunch of threads to read, sort and write.
     * setConverter() must be called before calling this method.
     */
    public void doTileProcessing() {
        try {
            // TODO: Eliminate this when switch to JDK 7
            FileChannelJDKBugWorkAround.doBugWorkAround();

            // Generate the list of tiles that will be processed
            final List<Tile> tiles = new ArrayList<Tile>();
            for (final Integer tileNumber : this.tiles) {
                tiles.add(new Tile(tileNumber));
            }

            final TileReadAggregator tileReadAggregator = new TileReadAggregator(tiles);
            tileReadAggregator.submit();
            try {
                tileReadAggregator.awaitWorkComplete();
            } catch (final InterruptedException e) {
                log.error(e, "Failure encountered in worker thread; attempting to shut down remaining worker threads and terminate ...");
                throw new PicardException("Failure encountered in worker thread; see log for details.");
            } finally {
                tileReadAggregator.shutdown();
            }

            for (final Map.Entry<Byte, Integer> entry : bclQualityEvaluationStrategy.getPoorQualityFrequencies().entrySet()) {
                log.warn(String.format("Observed low quality of %s %s times.", entry.getKey(), entry.getValue()));
            }
            bclQualityEvaluationStrategy.assertMinimumQualities();

        } finally {
            try {
              if (gcTimerTask != null) gcTimerTask.cancel();
            } catch (final Throwable ex) {
                log.warn(ex, "Ignoring exception stopping background GC thread.");
            }
            // Close the writers
            for (final Map.Entry<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> entry : barcodeRecordWriterMap.entrySet()) {
                final ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD> writer = entry.getValue();
                log.debug(String.format("Closing file for barcode %s.", entry.getKey()));
                writer.close();
            }
        }
    }

    /**
     * Simple representation of a tile
     */
    private static class Tile implements Comparable<Tile> {
        private final int tileNumber;

        public Tile(final int i) {
            tileNumber = i;
        }

        public int getNumber() {
            return tileNumber;
        }

        @Override
        public boolean equals(final Object o) {
            return o instanceof Tile && this.getNumber() == ((Tile) o).getNumber();
        }

        @Override
        public int compareTo(final Tile o) {
            return TILE_NUMBER_COMPARATOR.compare(this.getNumber(), o.getNumber());
        }
    }


    /**
     * A Runnable that carries a priority which is used to compare and order other PriorityRunnables in a task queue.
     * <p/>
     * The priority is fixed at construction time.  The queue comparator in TileReadAggregator casts every queued
     * Runnable to this type, so work submitted to that pool is expected to extend this class.
     */
    private abstract class PriorityRunnable implements Runnable {
        private final int priority;

        /**
         * Create a new priority runnable with a default priority of 1.
         */
        public PriorityRunnable() {
            this(1);
        }

        /**
         * Create a new priority runnable with the given priority.
         *
         * @param priority The priority level; may be negative (tile-reading tasks are submitted that way).
         */
        public PriorityRunnable(final int priority) {
            this.priority = priority;
        }

        /**
         * Returns the priority level.  Higher priorities are run earlier.
         *
         * @return The priority given at construction, or 1 if the no-argument constructor was used.
         */
        int getPriority() {
            return this.priority;
        }
    }


    /**
     * Represents the state of a tile's processing and encapsulates the data collected from that tile.
     * <p/>
     * TileProcessingRecords are accessed from each worker thread to assess the progress of the run, so its methods
     * are synchronized.
     */
    private class TileProcessingRecord {
        final private Map<String, SortingCollection<CLUSTER_OUTPUT_RECORD>> barcodeToRecordCollection =
                new HashMap<String, SortingCollection<CLUSTER_OUTPUT_RECORD>>();
        final private Map<String, TileBarcodeProcessingState> barcodeToProcessingState = new HashMap<String, TileBarcodeProcessingState>();
        private TileProcessingState state = TileProcessingState.NOT_DONE_READING;
        private long recordCount = 0;

        /**
         * Returns the state of this tile's processing.
         */
        public synchronized TileProcessingState getState() {
            return this.state;
        }

        /**
         * Sets the state of this tile's processing.
         */
        public synchronized void setState(final TileProcessingState state) {
            this.state = state;
        }

        /**
         * Adds the provided record to this tile.
         */
        public synchronized void addRecord(final String barcode, final CLUSTER_OUTPUT_RECORD record) {
            this.recordCount += 1;

            // Grab the existing collection, or initialize it if it doesn't yet exist
            SortingCollection<CLUSTER_OUTPUT_RECORD> recordCollection = this.barcodeToRecordCollection.get(barcode);
            if (recordCollection == null) {
                if (!barcodeRecordWriterMap.containsKey(barcode))
                    throw new PicardException(String.format("Read records with barcode %s, but this barcode was not expected.  (Is it referenced in the parameters file?)", barcode));
                recordCollection = this.newSortingCollection();
                this.barcodeToRecordCollection.put(barcode, recordCollection);
                this.barcodeToProcessingState.put(barcode, null);
            }
            recordCollection.add(record);
        }

        private synchronized SortingCollection<CLUSTER_OUTPUT_RECORD> newSortingCollection() {
            final int maxRecordsInRam =
                    maxReadsInRamPerTile /
                            barcodeRecordWriterMap.size();
            return SortingCollection.newInstance(
                    outputRecordClass,
                    codecPrototype.clone(),
                    outputRecordComparator,
                    maxRecordsInRam,
                    tmpDirs);
        }

        /**
         * Returns the number of unique barcodes read.
         */
        public synchronized long getBarcodeCount() {
            return this.barcodeToRecordCollection.size();
        }

        /**
         * Returns the number of records read.
         */
        public synchronized long getRecordCount() {
            return recordCount;
        }

        /**
         * Returns the mapping of barcodes to records associated with them.
         */
        public synchronized Map<String, SortingCollection<CLUSTER_OUTPUT_RECORD>> getBarcodeRecords() {
            return barcodeToRecordCollection;
        }

        /**
         * Gets the state of the provided barcode's data's processing progress.  Only invoke this query if this tile
         * is in a DONE_READING state.
         *
         * @throws IllegalStateException When a barcode is queried before the tile is in the DONE_READING state
         */
        public synchronized TileBarcodeProcessingState getBarcodeState(final String barcode) {
            if (this.getState() == TileProcessingState.NOT_DONE_READING) {
                throw new IllegalStateException(
                        "A tile's barcode data's state cannot be queried until the tile has been completely read.");
            }

            if (this.barcodeToProcessingState.containsKey(barcode)) {
                return this.barcodeToProcessingState.get(barcode);
            } else {
                return TileBarcodeProcessingState.NA;
            }
        }

        public synchronized Map<String, TileBarcodeProcessingState> getBarcodeProcessingStates() {
            return this.barcodeToProcessingState;
        }

        /**
         * Sets the processing state of the provided barcode in this record.
         *
         * @throws java.util.NoSuchElementException When the provided barcode is not one associated with this record.
         */
        public synchronized void setBarcodeState(final String barcode, final TileBarcodeProcessingState state) {
            if (this.barcodeToProcessingState.containsKey(barcode)) {
                this.barcodeToProcessingState.put(barcode, state);
            } else {
                throw new NoSuchElementException(String.format("No record of the provided barcode, %s.", barcode));
            }
        }

        /**
         * Returns the distinct set of barcodes for which data has been collected in this record.
         *
         * @return
         */
        public synchronized Set<String> getBarcodes() {
            return this.getBarcodeRecords().keySet();
        }
    }

    /**
     * Reads the information from a tile via an IlluminaDataProvider and feeds red information into a processingRecord
     * managed by the TileReadAggregator.
     */
    private class TileReader {
        private final Tile tile;
        private final TileReadAggregator handler;
        private final TileProcessingRecord processingRecord;

        public TileReader(final Tile tile, final TileReadAggregator handler, final TileProcessingRecord processingRecord) {
            this.tile = tile;
            this.handler = handler;
            this.processingRecord = processingRecord;
        }

        /**
         * Reads the data from the appropriate IlluminaDataProvider and feeds it into the TileProcessingRecord for
         * this tile.
         */
        public void process() {
            final IlluminaDataProvider dataProvider = factory.makeDataProvider(Arrays.asList(this.tile.getNumber()));
            log.debug(String.format("Reading data from tile %s ...", tile.getNumber()));

            while (dataProvider.hasNext()) {
                final ClusterData cluster = dataProvider.next();
                readProgressLogger.record(null, 0);
                // If this cluster is passing, or we do NOT want to ONLY emit passing reads, then add it to the next
                if (cluster.isPf() || includeNonPfReads) {
                    final String barcode = (demultiplex ? cluster.getMatchedBarcode() : null);
                    this.processingRecord.addRecord(barcode, converter.convertClusterToOutputRecord(cluster));
                }
            }

            this.handler.completeTile(this.tile);
            dataProvider.close();
        }
    }


    /**
     * Aggregates data collected from tiles and writes them to file. Accepts records from TileReaders and maps
     * them to the appropriate BAM writers.
     */
    private class TileReadAggregator {
        /**
         * The collection of records associated with a particular tile.
         * <p/>
         * Implemented as a TreeMap to guarantee tiles are iterated over in natural order.
         */
        private final Map<Tile, TileProcessingRecord> tileRecords = new TreeMap<Tile, TileProcessingRecord>();

        /**
         * The executor responsible for doing work.
         * <p/>
         * Implemented as a ThreadPoolExecutor with a PriorityBlockingQueue which orders submitted Runnables by their
         * priority.
         */
        private final ExecutorService prioritizingThreadPool = new ThreadPoolExecutor(
                numThreads,
                numThreads,
                0L,
                MILLISECONDS,
                new PriorityBlockingQueue<Runnable>(5, new Comparator<Runnable>() {
                    @Override
                    /**
                     * Compare the two Runnables, and assume they are PriorityRunnable; if not something strange is
                     * going on, so allow a ClassCastException be thrown.
                     */
                    public int compare(final Runnable o1, final Runnable o2) {
                        // Higher priority items go earlier in the queue, so reverse the "natural" comparison.
                        return ((PriorityRunnable) o2).getPriority() - ((PriorityRunnable) o1).getPriority();
                    }
                })
        );

        /**
         * The object acting as a latch to notify when the aggregator completes its work.
         */
        private final Object completionLatch = new Object();

        /**
         * Stores the thread that is executing this work so that it can be interrupted upon failure.
         */
        private Thread parentThread;
        private final Object workEnqueueMonitor = new Object();
        private final AtomicBoolean submitted = new AtomicBoolean(false);


        /**
         * Creates a TileReadAggregator that reads from the provided tiles.
         *
         * @param tiles
         */
        public TileReadAggregator(final Collection<Tile> tiles) {
            for (final Tile t : tiles) {
                tileRecords.put(t, new TileProcessingRecord());
            }
        }

        /**
         * Execute the tile aggregator's work.  Creates a thread pool to read data from tiles and write them to file.
         * Invoke this method only once.
         *
         * @throws IllegalStateException If submit was called more than once.
         */
        public void submit() {
            // Ensure the aggregator as not yet been submitted
            if (!this.submitted.compareAndSet(false, true)) {
                throw new IllegalStateException("The submit() method may not be called more than once.");
            }

            // Set the thread that is executing this work
            this.parentThread = Thread.currentThread();

            /**
             * For each tile, create and submit a tile processor.  Give it a negative execution priority (so that
             * prioritized tasks with a positive execution priority execute first), and give later tiles a lesser
             * (more negative) priority.
             */
            int priority = 0;
            for (final Tile tile : this.tileRecords.keySet()) {
                final TileReader reader = new TileReader(tile, this, this.tileRecords.get(tile));
                this.prioritizingThreadPool.execute(new PriorityRunnable(--priority) {
                    @Override
                    public void run() {
                        try {
                            reader.process();
                        } catch (final RuntimeException e) {
                            /**
                             * In the event of an internal failure, signal to the parent thread that something has gone
                             * wrong.  This is necessary because if an item of work fails to complete, the aggregator will
                             * will never reach its completed state, and it will never terminate.
                             */
                            parentThread.interrupt();
                            throw e;
                        } catch (final Error e) {
                            parentThread.interrupt();
                            throw e;
                        }
                    }
                });
            }
        }

        /**
         * Signals that a tile's processing is complete.  This must be invoked exactly once per tile, and only after
         * all of that tile has been processed.
         *
         * @throws IllegalStateException When the tile is already in the completed state.
         */
        private void completeTile(final Tile tile) {
            final TileProcessingRecord tileRecord = this.tileRecords.get(tile);

            if (tileRecord.getState() == TileProcessingState.DONE_READING) {
                throw new IllegalStateException("This tile is already in the completed state.");
            }

            // Update all of the barcodes and the tile to be marked as read
            for (final String barcode : tileRecord.getBarcodes()) {
                tileRecord.setBarcodeState(barcode, TileBarcodeProcessingState.READ);
                tileRecord.barcodeToRecordCollection.get(barcode).doneAdding();
            }
            tileRecord.setState(TileProcessingState.DONE_READING);

            log.debug(String.format("Completed reading tile %s; collected %s reads spanning %s barcodes.",
                    tile.getNumber(), tileRecord.getRecordCount(), tileRecord.getBarcodeCount()));

            //noinspection SynchronizationOnLocalVariableOrMethodParameter
            this.findAndEnqueueWorkOrSignalCompletion();
        }

        /**
         * Blocks until this aggregator completes its work.
         *
         * @throws InterruptedException
         */
        public void awaitWorkComplete() throws InterruptedException {
            synchronized (this.completionLatch) {
                this.completionLatch.wait();
            }
        }

        /**
         * Signals to any thread awaiting via awaitWorkComplete() that no work remains. Called
         * when this aggregator has reached its completed state.
         */
        private void signalWorkComplete() {
            synchronized (this.completionLatch) {
                this.completionLatch.notifyAll();
            }
        }

        /**
         * Poll the aggregator to find more tasks for it to enqueue.  Specifically, searches for un-written data
         * read from tiles for each barcode and enqueues it for writing.
         */
        private void findAndEnqueueWorkOrSignalCompletion() {
            synchronized (this.workEnqueueMonitor) {
                /**
                 * If there is work remaining to be done in this aggregator, walk through all of the barcodes and find
                 * tiles which have not yet written their barcode data but are in a state where they are able to.
                 */
                if (this.isWorkCompleted()) {
                    this.signalWorkComplete();
                } else {
                    final Queue<Runnable> tasks = new LinkedList<Runnable>();
                    for (final String barcode : barcodeRecordWriterMap.keySet()) {
                        NEXT_BARCODE:
                        for (final Map.Entry<Tile, TileProcessingRecord> entry : this.tileRecords.entrySet()) {
                            final Tile tile = entry.getKey();
                            final TileProcessingRecord tileRecord = entry.getValue();

                            /**
                             * If this tile has not been read, we cannot write this or later tiles' barcode data;
                             * move to the next barcode.
                             */
                            if (tileRecord.getState() != TileProcessingState.DONE_READING) {
                                break;
                            }
                            switch (tileRecord.getBarcodeState(barcode)) {
                                case NA:
                                case WRITTEN:
                                    /**
                                     * There is no data for this barcode for this tile, or it is already written; in
                                     * either scenario, this barcode will not be processed further for this tile, so
                                     * move onto the next tile as a possible candidate.
                                     */
                                    continue;
                                case QUEUED_FOR_WRITE:
                                    /**
                                     * The write for this barcode is in progress for this tile, so skip to the next
                                     * barcode.
                                     */
                                    break NEXT_BARCODE;
                                case READ:
                                    /**
                                     * This barcode has been read, and all of the earlier tiles have been written
                                     * for this barcode, so queue its writing.
                                     */
                                    tileRecord.setBarcodeState(barcode, TileBarcodeProcessingState.QUEUED_FOR_WRITE);
                                    log.debug(String.format("Enqueuing work for tile %s and barcode %s.", tile.getNumber(), barcode));
                                    tasks.add(this.newBarcodeWorkInstance(tile, tileRecord, barcode));
                                    break NEXT_BARCODE;
                            }
                        }
                    }

                    for (final Runnable task : tasks) {
                        this.prioritizingThreadPool.execute(task);
                    }
                }
            }
        }

        /**
         * Returns a PriorityRunnable that encapsulates the work involved with writing the provided tileRecord's data
         * for the given barcode to disk.
         *
         * @param tile       The tile from which the record was read
         * @param tileRecord The processing record associated with the tile
         * @param barcode    The barcode whose data within the tileRecord is to be written
         * @return The runnable that upon invocation writes the barcode's data from the tileRecord to disk
         */
        private PriorityRunnable newBarcodeWorkInstance(final Tile tile, final TileProcessingRecord tileRecord, final String barcode) {
            return new PriorityRunnable() {
                @Override
                public void run() {
                    try {
                        final SortingCollection<CLUSTER_OUTPUT_RECORD> records = tileRecord.getBarcodeRecords().get(barcode);
                        final ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD> writer = barcodeRecordWriterMap.get(barcode);

                        log.debug(String.format("Writing records from tile %s with barcode %s ...", tile.getNumber(), barcode));

                        final PeekIterator<CLUSTER_OUTPUT_RECORD> it = new PeekIterator<CLUSTER_OUTPUT_RECORD>(records.iterator());
                        while (it.hasNext()) {
                            final CLUSTER_OUTPUT_RECORD rec = it.next();

                            /**
                             * PIC-330 Sometimes there are two reads with the same cluster coordinates, and thus
                             * the same read name.  Discard both of them.  This code assumes that the two first of pairs
                             * will come before the two second of pairs, so it isn't necessary to look ahead a different
                             * distance for paired end.  It also assumes that for paired ends there will be duplicates
                             * for both ends, so there is no need to be PE-aware.
                             */
                            if (it.hasNext()) {
                                final CLUSTER_OUTPUT_RECORD lookAhead = it.peek();

/* TODO: Put this in SAMFileWriter wrapper
                                if (!rec.getReadUnmappedFlag() || !lookAhead.getReadUnmappedFlag()) {
                                    throw new IllegalStateException("Should not have mapped reads.");
                                }
*/

                                if (outputRecordComparator.compare(rec, lookAhead) == 0) {
                                    it.next();
                                    log.info("Skipping reads with identical read names: " + rec.toString());
                                    continue;
                                }
                            }

                            writer.write(rec);
                            writeProgressLogger.record(null, 0);
                        }

                        tileRecord.setBarcodeState(barcode, TileBarcodeProcessingState.WRITTEN);
                        findAndEnqueueWorkOrSignalCompletion();

                    } catch (final RuntimeException e) {
                        /**
                         * In the event of an internal failure, signal to the parent thread that something has gone
                         * wrong.  This is necessary because if an item of work fails to complete, the aggregator will
                         * will never reach its completed state, and it will never terminate.
                         */
                        parentThread.interrupt();
                        throw e;
                    } catch (final Error e) {
                        parentThread.interrupt();
                        throw e;
                    }
                }

            };
        }

        /**
         * Returns true if this aggregator has not completed its work.  Specifically, returns false iff
         * any tile's barcode data yas not yet been written.
         *
         * @return True if more work remains to be done, false otherwise
         */
        public boolean isWorkCompleted() {
            for (final Map.Entry<Tile, TileProcessingRecord> entry : this.tileRecords.entrySet()) {
                final TileProcessingRecord tileProcessingRecord = entry.getValue();

                if (tileProcessingRecord.getState() != TileProcessingState.DONE_READING) {
                    log.debug(String.format("Work is not completed because a tile isn't done being read: %s.", entry.getKey().getNumber()));
                    return false;
                } else {
                    for (final Map.Entry<String, TileBarcodeProcessingState> barcodeStateEntry : tileProcessingRecord.getBarcodeProcessingStates().entrySet()) {
                        final TileBarcodeProcessingState barcodeProcessingState = barcodeStateEntry.getValue();
                        if (barcodeProcessingState != TileBarcodeProcessingState.WRITTEN) {
                            log.debug(String.format("Work is not completed because a tile isn't done being read: Tile %s, Barcode %s, Processing State %s.", entry.getKey().getNumber(), barcodeStateEntry.getKey(), barcodeProcessingState));
                            return false;
                        }
                    }
                }
            }
            log.info("All work is complete.");
            return true;
        }

        /**
         * Terminates the threads currently exiting in the thread pool abruptly via ThreadPoolExecutor.shutdownNow().
         */
        public void shutdown() {
            this.prioritizingThreadPool.shutdownNow();
        }
    }

    /**
     * Given a read structure return the data types that need to be parsed for this run
     */
    private static IlluminaDataType[] getDataTypesFromReadStructure(final ReadStructure readStructure,
                                                                    final boolean demultiplex) {
        if (readStructure.barcodes.isEmpty() || !demultiplex) {
            return DATA_TYPES_NO_BARCODE;
        } else {
            return DATA_TYPES_WITH_BARCODE;
        }
    }

    public static interface ClusterDataConverter<OUTPUT_RECORD> {

        /**
         * Creates the OUTPUT_RECORDs from the cluster
         */
        public OUTPUT_RECORD convertClusterToOutputRecord(final ClusterData cluster);
    }

    public static interface ConvertedClusterDataWriter<OUTPUT_RECORD> {
        void write(final OUTPUT_RECORD rec);

        void close();
    }
}
TOP

Related Classes of picard.illumina.IlluminaBasecallsConverter$ClusterDataConverter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.