Package picard.sam

Source Code of picard.sam.DownsampleSam

package picard.sam;

import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;

import java.io.File;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;

/**
* Class to randomly downsample a BAM file while respecting that we should either get rid
* of both ends of a pair or neither end of the pair!
*/
@CommandLineProgramProperties(
        usage = "Randomly down-sample a SAM or BAM file to retain " +
                "a random subset of the reads. Mate-pairs are either both kept or both discarded. Reads marked as not primary " +
                "alignments are all discarded. Each read is given a probability P of being retained - results with the exact " +
                "same input in the same order and with the same value for RANDOM_SEED will produce the same results.",
        usageShort = "Down-sample a SAM or BAM file to retain a random subset of the reads",
        programGroup = SamOrBam.class
)
public class DownsampleSam extends CommandLineProgram {

    @Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The input SAM or BAM file to downsample.")
    public File INPUT;

    @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="The output, downsampled, SAM or BAM file to write.")
    public File OUTPUT;

    @Option(shortName="R", doc="Random seed to use if reproducibilty is desired.  " +
            "Setting to null will cause multiple invocations to produce different results.")
    public Long RANDOM_SEED = 1L;

    @Option(shortName="P", doc="The probability of keeping any individual read, between 0 and 1.")
    public double PROBABILITY = 1;

    private final Log log = Log.getInstance(DownsampleSam.class);

    public static void main(final String[] args) {
        new DownsampleSam().instanceMainWithExit(args);
    }

    @Override
    protected int doWork() {
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsWritable(OUTPUT);

        final Random r = RANDOM_SEED == null ? new Random() : new Random(RANDOM_SEED);
        final SAMFileReader in = new SAMFileReader(INPUT);
        final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);
        final Map<String,Boolean> decisions = new HashMap<String,Boolean>();

        long total = 0;
        long kept  = 0;
       
        final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Read");

        for (final SAMRecord rec : in) {
            if (rec.isSecondaryOrSupplementary()) continue;
            ++total;

            final String key = rec.getReadName();
            final Boolean previous = decisions.remove(key);
            final boolean keeper;

            if (previous == null) {
                keeper = r.nextDouble() <= PROBABILITY;
                if (rec.getReadPairedFlag()) decisions.put(key, keeper);
            }
            else {
                keeper = previous;
            }

            if (keeper) {
                out.addAlignment(rec);
                ++kept;
            }

            progress.record(rec);
        }

        out.close();
        log.info("Finished! Kept " + kept + " out of " + total + " reads.");

        return 0;
    }
}
TOP

Related Classes of picard.sam.DownsampleSam

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.