/*
* The MIT License
*
* Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates.util;
import picard.PicardException;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.*;
import htsjdk.samtools.util.CloseableIterator;
import picard.cmdline.programgroups.SamOrBam;
import picard.sam.DuplicationMetrics;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
import java.io.File;
import java.util.*;
/**
* Abstract class that holds parameters and methods common to classes that perform duplicate
* detection and/or marking within SAM/BAM files.
*
* @author Nils Homer
*/
public abstract class AbstractMarkDuplicatesCommandLineProgram extends AbstractOpticalDuplicateFinderCommandLineProgram {

    @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
            doc = "One or more input SAM or BAM files to analyze. Must be coordinate sorted.")
    public List<File> INPUT;

    @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
            doc = "The output file to write marked records to")
    public File OUTPUT;

    @Option(shortName = "M",
            doc = "File to write duplication metrics to")
    public File METRICS_FILE;

    @Option(shortName = StandardOptionDefinitions.PROGRAM_RECORD_ID_SHORT_NAME,
            doc = "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
                    "PG record creation. This string may have a suffix appended to avoid collision with other " +
                    "program record IDs.",
            optional = true)
    public String PROGRAM_RECORD_ID = "MarkDuplicates";

    @Option(shortName = "PG_VERSION",
            doc = "Value of VN tag of PG record to be created. If not specified, the version will be detected automatically.",
            optional = true)
    public String PROGRAM_GROUP_VERSION;

    @Option(shortName = "PG_COMMAND",
            doc = "Value of CL tag of PG record to be created. If not supplied the command line will be detected automatically.",
            optional = true)
    public String PROGRAM_GROUP_COMMAND_LINE;

    @Option(shortName = "PG_NAME",
            doc = "Value of PN tag of PG record to be created.")
    public String PROGRAM_GROUP_NAME = getClass().getSimpleName();

    @Option(shortName = "CO",
            doc = "Comment(s) to include in the output file's header.",
            optional = true)
    public List<String> COMMENT = new ArrayList<String>();

    @Option(doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.")
    public boolean REMOVE_DUPLICATES = false;

    @Option(shortName = StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME,
            doc = "If true, assume that the input file is coordinate sorted even if the header says otherwise.")
    public boolean ASSUME_SORTED = false;

    @Option(shortName = "DS", doc = "The scoring strategy for choosing the non-duplicate among candidates.")
    public ScoringStrategy DUPLICATE_SCORING_STRATEGY = ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH;

    /** The program groups that have been seen during the course of examining the input records. */
    protected final Set<String> pgIdsSeen = new HashSet<String>();

    /**
     * We have to re-chain the program groups based on this algorithm. This returns the map from existing program group ID
     * to new program group ID.
     * <p>
     * For every PG id observed in the input records ({@link #pgIdsSeen}), a new {@link SAMProgramRecord} is created
     * (with a non-colliding id derived from {@link #PROGRAM_RECORD_ID}), its PP tag chained to the existing id, and the
     * record added to {@code outputHeader}. As a side effect, {@link #PROGRAM_GROUP_VERSION} and
     * {@link #PROGRAM_GROUP_COMMAND_LINE} are filled in from the running program if the user did not supply them.
     *
     * @param outputHeader the header that will receive the newly created @PG records
     * @return map from existing program group ID to the newly created program group ID, or null if
     *         {@link #PROGRAM_RECORD_ID} is null (PG record creation disabled)
     */
    protected Map<String, String> getChainedPgIds(final SAMFileHeader outputHeader) {
        final Map<String, String> chainedPgIds;
        // Generate new PG record(s)
        if (PROGRAM_RECORD_ID != null) {
            final PgIdGenerator pgIdGenerator = new PgIdGenerator(outputHeader);
            if (PROGRAM_GROUP_VERSION == null) {
                PROGRAM_GROUP_VERSION = this.getVersion();
            }
            if (PROGRAM_GROUP_COMMAND_LINE == null) {
                PROGRAM_GROUP_COMMAND_LINE = this.getCommandLine();
            }
            chainedPgIds = new HashMap<String, String>();
            for (final String existingId : this.pgIdsSeen) {
                final String newPgId = pgIdGenerator.getNonCollidingId(PROGRAM_RECORD_ID);
                chainedPgIds.put(existingId, newPgId);
                final SAMProgramRecord programRecord = new SAMProgramRecord(newPgId);
                programRecord.setProgramVersion(PROGRAM_GROUP_VERSION);
                programRecord.setCommandLine(PROGRAM_GROUP_COMMAND_LINE);
                programRecord.setProgramName(PROGRAM_GROUP_NAME);
                // Chain the new record to the one that produced the input records.
                programRecord.setPreviousProgramGroupId(existingId);
                outputHeader.addProgramRecord(programRecord);
            }
        } else {
            chainedPgIds = null;
        }
        return chainedPgIds;
    }

    /**
     * Writes the metrics given by the libraryIdGenerator to the METRICS_FILE.
     * <p>
     * Pair counts accumulated per-end during processing are halved to express them per-pair, optical duplicate
     * counts are copied in from the per-library histogram, and derived metrics (e.g. duplication rate) are
     * calculated before writing. If exactly one library is present, its ROI histogram is attached to the file.
     *
     * @param libraryIdGenerator source of the per-library metrics, library-id map, and optical duplicate histogram
     */
    protected void finalizeAndWriteMetrics(final LibraryIdGenerator libraryIdGenerator) {
        final Map<String, DuplicationMetrics> metricsByLibrary = libraryIdGenerator.getMetricsByLibraryMap();
        final Histogram<Short> opticalDuplicatesByLibraryId = libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap();
        final Map<String, Short> libraryIds = libraryIdGenerator.getLibraryIdsMap();

        // Write out the metrics
        final MetricsFile<DuplicationMetrics, Double> file = getMetricsFile();
        for (final Map.Entry<String, DuplicationMetrics> entry : metricsByLibrary.entrySet()) {
            final String libraryName = entry.getKey();
            final DuplicationMetrics metrics = entry.getValue();

            // Each read of a pair was counted individually; convert to per-pair counts.
            metrics.READ_PAIRS_EXAMINED = metrics.READ_PAIRS_EXAMINED / 2;
            metrics.READ_PAIR_DUPLICATES = metrics.READ_PAIR_DUPLICATES / 2;

            // Add the optical dupes to the metrics
            final Short libraryId = libraryIds.get(libraryName);
            if (libraryId != null) {
                final Histogram<Short>.Bin bin = opticalDuplicatesByLibraryId.get(libraryId);
                if (bin != null) {
                    metrics.READ_PAIR_OPTICAL_DUPLICATES = (long) bin.getValue();
                }
            }
            metrics.calculateDerivedMetrics();
            file.addMetric(metrics);
        }

        if (metricsByLibrary.size() == 1) {
            file.setHistogram(metricsByLibrary.values().iterator().next().calculateRoiHistogram());
        }

        file.write(METRICS_FILE);
    }

    /** Little class to generate program group IDs */
    static class PgIdGenerator {
        private int recordCounter;
        private final Set<String> idsThatAreAlreadyTaken = new HashSet<String>();

        /** Seeds the taken-id set with every PG id already present in the header. */
        PgIdGenerator(final SAMFileHeader header) {
            for (final SAMProgramRecord pgRecord : header.getProgramRecords()) {
                idsThatAreAlreadyTaken.add(pgRecord.getProgramGroupId());
            }
            recordCounter = idsThatAreAlreadyTaken.size();
        }

        /**
         * Returns {@code recordId} itself if it is unused, otherwise {@code recordId} with a base36 suffix
         * appended until an unused id is found. The returned id is recorded as taken.
         *
         * @param recordId the desired program group ID
         * @return a program group ID that does not collide with any previously seen or returned ID
         */
        String getNonCollidingId(final String recordId) {
            if (!idsThatAreAlreadyTaken.contains(recordId)) {
                // don't remap 1st record. If there are more records
                // with this id, they will be remapped in the 'else'.
                idsThatAreAlreadyTaken.add(recordId);
                ++recordCounter;
                return recordId;
            } else {
                String newId;
                // Below we tack on one of roughly 1.7 million possible 4 digit base36 at random. We do this because
                // our old process of just counting from 0 upward and adding that to the previous id led to 1000s of
                // calls idsThatAreAlreadyTaken.contains() just to resolve 1 collision when merging 1000s of similarly
                // processed bams.
                while (idsThatAreAlreadyTaken.contains(newId = recordId + "." + SamFileHeaderMerger.positiveFourDigitBase36Str(recordCounter++)))
                    ;
                idsThatAreAlreadyTaken.add(newId);
                return newId;
            }
        }
    }

    /** Little class used to package up a header and an iterable/iterator. */
    public static final class SamHeaderAndIterator {
        public final SAMFileHeader header;
        public final CloseableIterator<SAMRecord> iterator;

        public SamHeaderAndIterator(final SAMFileHeader header, final CloseableIterator<SAMRecord> iterator) {
            this.header = header;
            this.iterator = iterator;
        }
    }

    /**
     * Since this may read its inputs more than once this method does all the opening
     * and checking of the inputs.
     * <p>
     * Opens every file in {@link #INPUT} (with eager decoding), verifies coordinate sorting unless
     * {@link #ASSUME_SORTED} is set, and returns either the single reader's header/iterator or a merged
     * header/iterator over all inputs.
     *
     * @return the (possibly merged) header and a record iterator over all inputs
     * @throws PicardException if an input is not coordinate sorted and {@link #ASSUME_SORTED} is false
     */
    protected SamHeaderAndIterator openInputs() {
        final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(INPUT.size());
        final List<SAMFileReader> readers = new ArrayList<SAMFileReader>(INPUT.size());

        for (final File f : INPUT) {
            final SAMFileReader reader = new SAMFileReader(f, true); // eager decode
            final SAMFileHeader header = reader.getFileHeader();

            if (!ASSUME_SORTED && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
                throw new PicardException("Input file " + f.getAbsolutePath() + " is not coordinate sorted.");
            }

            headers.add(header);
            readers.add(reader);
        }

        // NOTE(review): the readers are never explicitly closed here; callers appear to rely on the
        // returned iterator/JVM exit for cleanup — confirm whether an explicit close hook is needed.
        if (headers.size() == 1) {
            return new SamHeaderAndIterator(headers.get(0), readers.get(0).iterator());
        } else {
            final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, false);
            final MergingSamRecordIterator iterator = new MergingSamRecordIterator(headerMerger, readers, ASSUME_SORTED);
            return new SamHeaderAndIterator(headerMerger.getMergedHeader(), iterator);
        }
    }

    /**
     * Looks through the set of reads and identifies how many of the duplicates are
     * in fact optical duplicates, and stores the data in the instance level histogram.
     * <p>
     * If the ends are a mixture of FR and RF orientations they are partitioned and each partition is
     * tracked independently, since optical-duplicate comparison is only meaningful within one orientation.
     *
     * @param ends                   the duplicate set to examine
     * @param opticalDuplicateFinder finder used to flag optical duplicates by physical location
     * @param libraryIdGenerator     holder of the per-library optical duplicate histogram to update
     * @throws PicardException if an end has an orientation other than FR or RF when partitioning
     */
    public static void trackOpticalDuplicates(List<? extends ReadEnds> ends,
                                              final OpticalDuplicateFinder opticalDuplicateFinder,
                                              final LibraryIdGenerator libraryIdGenerator) {
        boolean hasFR = false, hasRF = false;

        // Check to see if we have a mixture of FR/RF
        for (final ReadEnds end : ends) {
            if (ReadEnds.FR == end.orientationForOpticalDuplicates) {
                hasFR = true;
            } else if (ReadEnds.RF == end.orientationForOpticalDuplicates) {
                hasRF = true;
            }
        }

        // Check if we need to partition since the orientations could have changed
        if (hasFR && hasRF) { // need to track them independently
            // Variables used for optical duplicate detection and tracking
            final List<ReadEnds> trackOpticalDuplicatesF = new ArrayList<ReadEnds>();
            final List<ReadEnds> trackOpticalDuplicatesR = new ArrayList<ReadEnds>();

            // Split into two lists: first of pairs and second of pairs, since they must have orientation and same starting end
            for (final ReadEnds end : ends) {
                if (ReadEnds.FR == end.orientationForOpticalDuplicates) {
                    trackOpticalDuplicatesF.add(end);
                } else if (ReadEnds.RF == end.orientationForOpticalDuplicates) {
                    trackOpticalDuplicatesR.add(end);
                } else {
                    // Report the field actually being tested; previously this printed end.orientation,
                    // which could show a valid-looking value while orientationForOpticalDuplicates was bad.
                    throw new PicardException("Found an unexpected orientation: " + end.orientationForOpticalDuplicates);
                }
            }

            // track the duplicates
            trackOpticalDuplicates(trackOpticalDuplicatesF, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap());
            trackOpticalDuplicates(trackOpticalDuplicatesR, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap());
        } else { // No need to partition
            AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(ends, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap());
        }
    }

    /**
     * Looks through the set of reads and identifies how many of the duplicates are
     * in fact optical duplicates, and stores the data in the instance level histogram.
     * <p>
     * All entries in {@code list} are assumed to share a library id; the count of flagged optical
     * duplicates (if any) is added to that library's histogram bin.
     *
     * @param list                        physical locations of one duplicate set, all from the same library
     * @param opticalDuplicateFinder      finder used to flag optical duplicates
     * @param opticalDuplicatesByLibraryId histogram of optical duplicate counts keyed by library id
     */
    private static void trackOpticalDuplicates(final List<? extends OpticalDuplicateFinder.PhysicalLocation> list,
                                               final OpticalDuplicateFinder opticalDuplicateFinder,
                                               final Histogram<Short> opticalDuplicatesByLibraryId) {
        final boolean[] opticalDuplicateFlags = opticalDuplicateFinder.findOpticalDuplicates(list);

        int opticalDuplicates = 0;
        for (final boolean b : opticalDuplicateFlags) if (b) ++opticalDuplicates;
        if (opticalDuplicates > 0) {
            opticalDuplicatesByLibraryId.increment(list.get(0).getLibraryId(), opticalDuplicates);
        }
    }
}