Package org.broadinstitute.gatk.tools.walkers.readutils

Source Code of org.broadinstitute.gatk.tools.walkers.readutils.PrintReads

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.tools.walkers.readutils;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMReadGroupRecord;
import org.broadinstitute.gatk.engine.walkers.*;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.Hidden;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter;
import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
import org.broadinstitute.gatk.engine.iterators.ReadTransformersMode;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.baq.BAQ;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;

import java.io.File;
import java.util.*;

/**
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
*
* <p>
* PrintReads can dynamically merge the contents of multiple input BAM files, resulting
* in merged output sorted in coordinate order.  Can also optionally filter reads based on the
* --read_filter command line argument.
* </p>
*
* <p>
* Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow,
* it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below.
* </p>
*
* <h3>Input</h3>
* <p>
* One or more bam files.
* </p>
*
* <h3>Output</h3>
* <p>
* A single processed bam file.
* </p>
*
* <h3>Examples</h3>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T PrintReads \
*   -o output.bam \
*   -I input1.bam \
*   -I input2.bam \
*   --read_filter MappingQualityZero
*
* // Prints the first 2000 reads in the BAM file
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T PrintReads \
*   -o output.bam \
*   -I input.bam \
*   -n 2000
*
* // Downsamples BAM file to 25%
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T PrintReads \
*   -o output.bam \
*   -I input.bam \
*   -dfrac 0.25
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} )
@ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
@Requires({DataSource.READS, DataSource.REFERENCE})
public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements NanoSchedulable {

    @Output(doc="Write output to this BAM filename instead of STDOUT")
    GATKSAMFileWriter out;

    @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false)
    String readGroup = null;

    /**
     * For example, --platform ILLUMINA or --platform 454.
     */
    @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
    String platform = null;

    /**
     * Only prints the first n reads of the file
     */
    @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
    int nReadsToPrint = -1;

    /**
     * Only reads from samples listed in the provided file(s) will be included in the output.
     */
    @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
    public Set<File> sampleFile = new TreeSet<>();

    /**
     * Only reads from the sample(s) will be included in the output.
     */
    @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
    public Set<String> sampleNames = new TreeSet<>();

    /**
     * Erase all extra attributes in the read but keep the read group information
     */
    @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false)
    public boolean simplifyReads = false;

    @Hidden
    @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
    public boolean NO_PG_TAG = false;

    List<ReadTransformer> readTransformers = Collections.emptyList();
    private Set<String> readGroupsToKeep = Collections.emptySet();

    public static final String PROGRAM_RECORD_NAME = "GATK PrintReads";   // The name that will go in the @PG tag
   
    Random random;


    /**
     * The initialize function.
     */
    public void initialize() {
        final GenomeAnalysisEngine toolkit = getToolkit();

        if ( toolkit != null )
            readTransformers = toolkit.getReadTransformers();

        //Sample names are case-insensitive
        final TreeSet<String> samplesToChoose = new TreeSet<>(new Comparator<String>() {
            @Override
            public int compare(String a, String b) {
                return a.compareToIgnoreCase(b);
            }
        });
        Collection<String> samplesFromFile;
        if (!sampleFile.isEmpty())  {
            samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile);
            samplesToChoose.addAll(samplesFromFile);
        }

        if (!sampleNames.isEmpty())
            samplesToChoose.addAll(sampleNames);

        random = GenomeAnalysisEngine.getRandomGenerator();

        if (toolkit != null) {
            final SAMFileHeader outputHeader = toolkit.getSAMFileHeader().clone();
            readGroupsToKeep = determineReadGroupsOfInterest(outputHeader, samplesToChoose);

            //If some read groups are to be excluded, remove them from the output header
            pruneReadGroups(outputHeader);

            //Add the program record (if appropriate) and set up the writer
            final boolean preSorted = true;
            if (toolkit.getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) {
                Utils.setupWriter(out, toolkit, outputHeader, preSorted, this, PROGRAM_RECORD_NAME);
            } else {
                out.writeHeader(outputHeader);
                out.setPresorted(preSorted);
            }

        }

    }

    /**
     * The reads filter function.
     *
     * @param ref  the reference bases that correspond to our read, if a reference was provided
     * @param read the read itself, as a GATKSAMRecord
     * @return true if the read passes the filter, false if it doesn't
     */
    public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
        // check that the read belongs to an RG that we need to keep
        if (!readGroupsToKeep.isEmpty()) {
            final SAMReadGroupRecord readGroup = read.getReadGroup();
            if (!readGroupsToKeep.contains(readGroup.getReadGroupId()))
                return false;
        }

        // check if we've reached the output limit
        if ( nReadsToPrint == 0 ) {
            return false;          // n == 0 means we've printed all we needed.
        }
        else if (nReadsToPrint > 0) {
            nReadsToPrint--;       // n > 0 means there are still reads to be printed.
        }

        return true;
    }

    /**
     * The reads map function.
     *
     * @param ref  the reference bases that correspond to our read, if a reference was provided
     * @param readIn the read itself, as a GATKSAMRecord
     * @return the read itself
     */
    public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) {
        GATKSAMRecord workingRead = readIn;

        for ( final ReadTransformer transformer : readTransformers ) {
            workingRead = transformer.apply(workingRead);
        }

        if ( simplifyReads ) workingRead = workingRead.simplify();

        return workingRead;
    }

    /**
     * reduceInit is called once before any calls to the map function.  We use it here to setup the output
     * bam file, if it was specified on the command line
     *
     * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
     */
    public SAMFileWriter reduceInit() {
        return out;
    }

    /**
     * given a read and a output location, reduce by emitting the read
     *
     * @param read   the read itself
     * @param output the output source
     * @return the SAMFileWriter, so that the next reduce can emit to the same source
     */
    public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) {
        output.addAlignment(read);
        return output;
    }

    /**
     * Determines the list of read groups that meet the user's criteria for inclusion (based on id, platform, or sample)
     * @param header         the merged header for all input files
     * @param samplesToKeep  the list of specific samples specified by the user
     * @return               a Set of read group IDs that meet the user's criteria, empty if all RGs should be included
     */
    private Set<String> determineReadGroupsOfInterest(final SAMFileHeader header, final Set<String> samplesToKeep) {
        //If no filter options that use read group information have been supplied, exit early
        if (platform == null && readGroup == null && samplesToKeep.isEmpty())
            return Collections.emptySet();

        if  ( platform != null )
            platform = platform.toUpperCase();

        final Set<String> result = new HashSet<>();
        for (final SAMReadGroupRecord rg : header.getReadGroups()) {
            // To be eligible for output, a read group must:
            //  NOT have an id that is blacklisted on the command line (note that String.equals(null) is false)
            //  AND NOT have a platform that contains the blacklisted platform from the command line
            //  AND have a sample that is whitelisted on the command line
            if (!rg.getReadGroupId().equals(readGroup) &&
                    (platform == null || !rg.getPlatform().toUpperCase().contains(platform)) &&
                    (samplesToKeep.isEmpty() || samplesToKeep.contains(rg.getSample())))
                result.add(rg.getReadGroupId());
        }

        if (result.isEmpty())
            throw new UserException.BadArgumentValue("-sn/-sf/-platform/-readGroup", "No read groups remain after pruning based on the supplied parameters");

        return result;
    }

    private void pruneReadGroups(final SAMFileHeader header) {
        if (readGroupsToKeep.isEmpty())
            return;

        final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
        for (final SAMReadGroupRecord rg : header.getReadGroups()) {
            if (readGroupsToKeep.contains(rg.getReadGroupId()))
                readGroups.add(rg);
        }
        header.setReadGroups(readGroups);
    }
}
TOP

Related Classes of org.broadinstitute.gatk.tools.walkers.readutils.PrintReads

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.