Package edu.wustl.genome.samtools

Source Code of edu.wustl.genome.samtools.GCSamToFastq

/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package edu.wustl.genome.samtools;

import net.sf.picard.PicardException;
import net.sf.picard.cmdline.CommandLineProgram;
import net.sf.picard.cmdline.Option;
import net.sf.picard.cmdline.StandardOptionDefinitions;
import net.sf.picard.cmdline.Usage;
import net.sf.picard.fastq.FastqRecord;
import net.sf.picard.fastq.FastqWriter;
import net.sf.picard.fastq.FastqWriterFactory;
import net.sf.picard.io.IoUtil;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMUtils;
import net.sf.samtools.util.SequenceUtil;
import net.sf.samtools.util.StringUtil;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
* Extracts read sequences and qualities from the input SAM/BAM file and writes them into
* the output file in Sanger fastq format.
* See <a href="http://maq.sourceforge.net/fastq.shtml">MAQ FastQ specification</a> for details.
* In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome,
* the read's sequence from input sam file will be reverse-complemented prior to writing it to fastq in order restore correctly
* the original read sequence as it was generated by the sequencer.
*/
public class GCSamToFastq extends CommandLineProgram {
    @Usage
    public String USAGE = "Extracts read sequences and qualities from the input SAM/BAM file and writes them into "+
        "the output file in Sanger fastq format. In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome, "+
        "the read's sequence from input SAM file will be reverse-complemented prior to writing it to fastq in order restore correctly "+
        "the original read sequence as it was generated by the sequencer.";

    @Option(doc="Input SAM/BAM file to extract reads from", shortName=StandardOptionDefinitions.INPUT_SHORT_NAME)
    public File INPUT ;

    @Option(shortName="F", doc="Output fastq file (single-end fastq or, if paired, first end of the pair fastq).", mutex={"OUTPUT_PER_RG"})
    public File FASTQ ;

    @Option(shortName="F2", doc="Output fastq file (if paired, second end of the pair fastq).", optional=true, mutex={"OUTPUT_PER_RG"})
    public File SECOND_END_FASTQ ;

    @Option(shortName="FRGF", doc="Fragment output fastq file", optional=true, mutex={"OUTPUT_PER_RG"})
    public File FRAGMENT_FASTQ;

    @Option(shortName="OPRG", doc="Output a fastq file per read group (two fastq files per read group if the group is paired).", optional=true, mutex={"FASTQ", "SECOND_END_FASTQ","FRAGMENT_FASTQ"})
    public boolean OUTPUT_PER_RG ;

    @Option(shortName="ODIR", doc="Directory in which to output the fastq file(s).  Used only when OUTPUT_PER_RG is true.", optional=true)
    public File OUTPUT_DIR;

    @Option(shortName="RC", doc="Re-reverse bases and qualities of reads with negative strand flag set before writing them to fastq", optional=true)
    public boolean RE_REVERSE = true;

    @Option(shortName="NON_PF", doc="Include non-PF reads from the SAM file into the output FASTQ files.")
    public boolean INCLUDE_NON_PF_READS = false;

    @Option(shortName="NO_ORPHAN", doc="Do not warn on orphaned mates when one side of the pair was filtered")
    public boolean IGNORE_ORPHAN_MATES = false;

    @Option(shortName="CLIP_ATTR", doc="The attribute that stores the position at which " +
            "the SAM record should be clipped", optional=true)
    public String CLIPPING_ATTRIBUTE;

    @Option(shortName="CLIP_ACT", doc="The action that should be taken with clipped reads: " +
            "'X' means the reads and qualities should be trimmed at the clipped position; " +
            "'N' means the bases should be changed to Ns in the clipped region; and any " +
            "integer means that the base qualities should be set to that value in the " +
            "clipped region.", optional=true)
    public String CLIPPING_ACTION;


    public static void main(final String[] argv) {
        System.exit(new GCSamToFastq().instanceMain(argv));
    }

    private static FastqWriterFactory fastqWriterFactory;

    private static FastqWriterFactory FastqWriterFactoryInstance(){
      if(fastqWriterFactory == null){
        fastqWriterFactory = new FastqWriterFactory();
      }
      return fastqWriterFactory;
    }

    protected int doWork() {
        IoUtil.assertFileIsReadable(INPUT);
       
        if (OUTPUT_PER_RG) {
            doGrouped();
        } else {
            IoUtil.assertFileIsWritable(FASTQ);

            if (SECOND_END_FASTQ == null) {
                doUnpaired();
            }
            else {
                doPaired();
            }
        }
        return 0;
    }

    protected void doUnpaired() {

        final SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(INPUT));
        final FastqWriter writer = FastqWriterFactoryInstance().newWriter(FASTQ);

        for (final SAMRecord record : reader ) {
            if (record.getReadFailsVendorQualityCheckFlag() && !INCLUDE_NON_PF_READS) {
                // do nothing
            }
            else {
                writeRecord(record, null, writer);
            }
        }
        reader.close();
        writer.close();
    }

    protected void notifyOrphan(String name) {
        if (IGNORE_ORPHAN_MATES == false)
            throw new PicardException("Read "+name+" is orphaned!");
    }

    protected void doPaired() {
        IoUtil.assertFileIsWritable(SECOND_END_FASTQ);
        IoUtil.assertFileIsWritable(FRAGMENT_FASTQ);

        final SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(INPUT));
        final FastqWriter writer1 = FastqWriterFactoryInstance().newWriter(FASTQ);
        final FastqWriter writer2 = FastqWriterFactoryInstance().newWriter(SECOND_END_FASTQ);
        final FastqWriter fragWriter = FastqWriterFactoryInstance().newWriter(FRAGMENT_FASTQ);
        final Map<String,SAMRecord> firstSeenMates = new HashMap<String,SAMRecord>();
        final Set<String> failedReadNames = new HashSet<String>();

        try {

            for (final SAMRecord currentRecord : reader ) {

                final String currentReadName = currentRecord.getReadName() ;
                final SAMRecord firstRecord = firstSeenMates.get(currentReadName);

                // Skip non-PF reads as necessary
                if (currentRecord.getReadFailsVendorQualityCheckFlag() && !INCLUDE_NON_PF_READS) {
                    if (currentRecord.getReadPairedFlag()) {
                        failedReadNames.add(currentReadName);
                        // if this record failed QC, but we were already holding its mate...
                        if (firstRecord != null) {
                            firstSeenMates.remove(currentReadName);
                            notifyOrphan(currentReadName);
                            writeRecord(firstRecord, null, fragWriter);
                        }
                    }
                    continue;
                }

                if (currentRecord.getReadPairedFlag()) {
                    // if this reads mate already failed QC...
                    if (failedReadNames.contains(currentReadName)) {
                        notifyOrphan(currentReadName);
                        writeRecord(currentRecord, null, fragWriter);
                        continue;
                    }

                    if (firstRecord == null) {
                        firstSeenMates.put(currentReadName, currentRecord) ;
                    }
                    else {
                        assertPairedMates(firstRecord, currentRecord);

                        if (currentRecord.getFirstOfPairFlag()) {
                             writeRecord(currentRecord, 1, writer1);
                             writeRecord(firstRecord, 2, writer2);
                        }
                        else {
                             writeRecord(firstRecord, 1, writer1);
                             writeRecord(currentRecord, 2, writer2);
                        }
                        firstSeenMates.remove(currentReadName);
                    }
                } else {
                    writeRecord(currentRecord, null, fragWriter);
                }
            }

            if (firstSeenMates.size() > 0)  {
               
                // are we ignoring these.   
                if (IGNORE_ORPHAN_MATES == false)
                    throw new PicardException("Found "+firstSeenMates.size()+" reads with flags that indicated they were paired, but no mates were seen.");
           
                for (final SAMRecord currentRecord : firstSeenMates.values()) {
                    writeRecord(currentRecord, null, fragWriter);
                }

            }

        } finally {
            // Flush as much as possible.
            writer1.close();
            writer2.close();
            fragWriter.close();
            reader.close();
        }

    }

    protected void doGrouped()
    {
        final SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(INPUT));
        final Map<String,SAMRecord> firstSeenMates = new HashMap<String,SAMRecord>();
        final Map<SAMReadGroupRecord, List<FastqWriter>> writers = new HashMap<SAMReadGroupRecord, List<FastqWriter>>();

        for (final SAMRecord currentRecord : reader ) {
            // Skip non-PF reads as necessary
            if (currentRecord.getReadFailsVendorQualityCheckFlag() && !INCLUDE_NON_PF_READS) continue;

            if(currentRecord.getReadPairedFlag())
            {
                doGroupedPaired(firstSeenMates, writers, currentRecord);
            }
            else
            {
                doGroupedUnpaired(writers, currentRecord);
            }
        }

        if (firstSeenMates.size() > 0 && IGNORE_ORPHAN_MATES == false) {
            throw new PicardException("Found "+firstSeenMates.size()+" unpaired mates");
        }

        reader.close();
        for(final List<FastqWriter> writerPair : writers.values()){
            for(final FastqWriter fq : writerPair){
                fq.close();
            }
        }
    }

    protected void doGroupedPaired(final Map<String,SAMRecord> firstSeenMates,
                                final Map<SAMReadGroupRecord, List<FastqWriter>> writers, final SAMRecord currentRecord) {
        final String currentReadName = currentRecord.getReadName() ;
        final SAMRecord firstRecord = firstSeenMates.remove(currentReadName);

        if (firstRecord == null) {
            firstSeenMates.put(currentReadName, currentRecord) ;
        }
        else {
            assertPairedMates(firstRecord, currentRecord);

            final SAMReadGroupRecord readGroup = currentRecord.getReadGroup();
            List<FastqWriter> writerPair;
            writerPair = writers.get(readGroup);
            if(writerPair == null)
            {
                final File fq1 = makeReadGroupFile(readGroup, "_1");
                IoUtil.assertFileIsWritable(fq1);
                final File fq2 = makeReadGroupFile(readGroup, "_2");
                IoUtil.assertFileIsWritable(fq2);

                writerPair = new ArrayList<FastqWriter>();
                writerPair.add(FastqWriterFactoryInstance().newWriter(fq1));
                writerPair.add(FastqWriterFactoryInstance().newWriter(fq2));
                writers.put(readGroup, writerPair);
            }

            if (currentRecord.getFirstOfPairFlag()) {
                 writeRecord(currentRecord, 1, writerPair.get(0));
                 writeRecord(firstRecord, 2, writerPair.get(1));
            }
            else {
                 writeRecord(firstRecord, 1, writerPair.get(0));
                 writeRecord(currentRecord, 2, writerPair.get(1));
            }
        }
    }

    protected void doGroupedUnpaired(final Map<SAMReadGroupRecord, List<FastqWriter>> writers, final SAMRecord currentRecord) {
        final SAMReadGroupRecord readGroup = currentRecord.getReadGroup();
        final SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(INPUT));

        List<FastqWriter> writerList = writers.get(readGroup);
        if(writerList == null){
            final File fq1 = makeReadGroupFile(readGroup, null);
            IoUtil.assertFileIsWritable(fq1);

            writerList = new ArrayList<FastqWriter>();
            writerList.add(FastqWriterFactoryInstance().newWriter(fq1));
        }

        writeRecord(currentRecord, null, writerList.get(0));
        reader.close();
    }

    private File makeReadGroupFile(final SAMReadGroupRecord readGroup, final String preExtSuffix) {
        String fileName = readGroup.getPlatformUnit();
        if (fileName == null) fileName = readGroup.getReadGroupId();
        fileName = IoUtil.makeFileNameSafe(fileName);
        if(preExtSuffix != null) fileName += preExtSuffix;
        fileName += ".fastq";

        if(OUTPUT_DIR != null) return new File(OUTPUT_DIR, fileName);
        return new File(fileName);
    }

    void writeRecord(final SAMRecord read, final Integer mateNumber, final FastqWriter writer) {
        final String seqHeader = mateNumber==null ? read.getReadName() : read.getReadName() + "/"+ mateNumber;
        String readString = read.getReadString();
        String baseQualities = read.getBaseQualityString();

        // If we're clipping, do the right thing to the bases or qualities
        if (CLIPPING_ATTRIBUTE != null) {
            final Integer clipPoint = (Integer)read.getAttribute(CLIPPING_ATTRIBUTE);
            if (clipPoint != null) {
                if (CLIPPING_ACTION.equalsIgnoreCase("X")) {
                    readString = clip(readString, clipPoint, null,
                            !read.getReadNegativeStrandFlag());
                    baseQualities = clip(baseQualities, clipPoint, null,
                            !read.getReadNegativeStrandFlag());

                }
                else if (CLIPPING_ACTION.equalsIgnoreCase("N")) {
                    readString = clip(readString, clipPoint, 'N',
                            !read.getReadNegativeStrandFlag());
                }
                else {
                    final char newQual = SAMUtils.phredToFastq(
                            new byte[] { (byte)Integer.parseInt(CLIPPING_ACTION)}).charAt(0);
                    baseQualities = clip(baseQualities, clipPoint, newQual,
                            !read.getReadNegativeStrandFlag());
                }
            }
        }
        if ( RE_REVERSE && read.getReadNegativeStrandFlag() ) {
            readString = SequenceUtil.reverseComplement(readString);
            baseQualities = StringUtil.reverseString(baseQualities);
        }
        writer.write(new FastqRecord(seqHeader, readString, "", baseQualities));

    }

    /**
     * Utility method to handle the changes required to the base/quality strings by the clipping
     * parameters.
     *
     * @param src           The string to clip
     * @param point         The 1-based position of the first clipped base in the read
     * @param replacement   If non-null, the character to replace in the clipped positions
     *                      in the string (a quality score or 'N').  If null, just trim src
     * @param posStrand     Whether the read is on the positive strand
     * @return String       The clipped read or qualities
     */
    private String clip(final String src, final int point, final Character replacement, final boolean posStrand) {
        final int len = src.length();
        String result = posStrand ? src.substring(0, point-1) : src.substring(len-point+1);
        if (replacement != null) {
            if (posStrand) {
                for (int i = point; i <= len; i++ ) {
                    result += replacement;
                }
            }
            else {
                for (int i = 0; i <= len-point; i++) {
                    result = replacement + result;
                }
            }
        }
        return result;
    }

    private void assertPairedMates(final SAMRecord record1, final SAMRecord record2) {
        if (! (record1.getFirstOfPairFlag() && record2.getSecondOfPairFlag() ||
               record2.getFirstOfPairFlag() && record1.getSecondOfPairFlag() ) ) {
            throw new PicardException("Illegal mate state: " + record1.getReadName());
        }
    }


    /**
    * Put any custom command-line validation in an override of this method.
    * clp is initialized at this point and can be used to print usage and access argv.
     * Any options set by command-line parser can be validated.
    * @return null if command line is valid.  If command line is invalid, returns an array of error
    * messages to be written to the appropriate place.
    */
    protected String[] customCommandLineValidation() {
        if ((CLIPPING_ATTRIBUTE != null && CLIPPING_ACTION == null) ||
            (CLIPPING_ATTRIBUTE == null && CLIPPING_ACTION != null)) {
            return new String[] {
                    "Both or neither of CLIPPING_ATTRIBUTE and CLIPPING_ACTION should be set." };
        }
        if (CLIPPING_ACTION != null) {
            if (CLIPPING_ACTION.equals("N") || CLIPPING_ACTION.equals("X")) {
                // Do nothing, this is fine
            }
            else {
                try {
                    Integer.parseInt(CLIPPING_ACTION);
                }
                catch (NumberFormatException nfe) {
                    return new String[] {"CLIPPING ACTION must be one of: N, X, or an integer"};
                }
            }
        }

        return null;
    }
}
TOP

Related Classes of edu.wustl.genome.samtools.GCSamToFastq

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.