Package picard.illumina.parser

Source Code of picard.illumina.parser.BclParser

/*
* The MIT License
*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.illumina.parser;


import htsjdk.samtools.util.CloseableIterator;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import picard.illumina.parser.readers.BclReader;

import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

import static htsjdk.samtools.util.CollectionUtil.makeSet;

/**
* BclParser parses a number of BclFiles equal to the total of all the values in outputLengths and returns a BclData object
* segmented based on these lengths.  The only client of this class should be IlluminaDataProvider and an test classes.  See BclReader for
* more information on BclFiles.  BclParser provides support for reading BaseCalls and QualityScores.
*/
class BclParser extends PerTileCycleParser<BclData> {
    private static final int EAMSS_M2_GE_THRESHOLD = 30;
    private static final int EAMSS_S1_LT_THRESHOLD = 15; //was 15
    public static final byte MASKING_QUALITY = (byte) 0x02;

    private static final Set<IlluminaDataType> SUPPORTED_TYPES = Collections.unmodifiableSet(makeSet(IlluminaDataType.BaseCalls, IlluminaDataType.QualityScores));

    protected final BclQualityEvaluationStrategy bclQualityEvaluationStrategy;
    private final boolean applyEamssFilter;

    public BclParser(final File directory, final int lane, final CycleIlluminaFileMap tilesToCycleFiles, final OutputMapping outputMapping, final BclQualityEvaluationStrategy bclQualityEvaluationStrategy) {
        this(directory, lane, tilesToCycleFiles, outputMapping, true, bclQualityEvaluationStrategy);
        this.initialize();
    }

    public BclParser(final File directory, final int lane, final CycleIlluminaFileMap tilesToCycleFiles, final OutputMapping outputMapping, final boolean applyEamssFilter, final BclQualityEvaluationStrategy bclQualityEvaluationStrategy) {
        super(directory, lane, tilesToCycleFiles, outputMapping);
        this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;
        this.applyEamssFilter = applyEamssFilter;
        this.initialize();
    }

    /**
     * Create a Bcl parser for an individual cycle and wrap it with the CycleFilesParser interface which populates
     * the correct cycle in BclData.
     *
     * @param files The files to parse.
     * @return A CycleFilesParser that populates a BclData object with data for a single cycle
     */
    @Override
    protected CycleFilesParser<BclData> makeCycleFileParser(final List<File> files) {
        return new BclDataCycleFileParser(files);
    }

    @Override
    public void initialize() {
        seekToTile(currentTile);
    }

    @Override
    public Set<IlluminaDataType> supportedTypes() {
        return SUPPORTED_TYPES;
    }

    @Override
    public BclData next() {
        final BclData bclData = super.next();

        final byte[][] bases = bclData.getBases();
        final byte[][] qualities = bclData.getQualities();

        //first run EAMSS
        if (this.applyEamssFilter) {
            for (int i = 0; i < bases.length; i++) {
                runEamssForReadInPlace(bases[i], qualities[i]);
            }
        }

        return bclData;
    }

    /**
     * EAMSS is an Illumina Developed Algorithm for detecting reads whose quality has deteriorated towards
     * their end and revising the quality to the masking quality (2) if this is the case.  This algorithm
     * works as follows (with one exception):
     * <p/>
     * Start at the end (high indices, at the right below) of the read and calculate an EAMSS tally at each
     * location as follow:
     * if(quality[i] < 15) tally += 1
     * if(quality[i] >= 15 and < 30) tally = tally
     * if(quality[i] >= 30) tally -= 2
     * <p/>
     * <p/>
     * For each location, keep track of this tally (e.g.)
     * Read Starts at <- this end
     * Cycle:       1  2  3  4  5  6  7  8  9
     * Bases:       A  C  T  G  G  G  T  C  A
     * Qualities:   32 32 16 15 8  10 32 2  2
     * Cycle Score: -2 -2 0  0  1  1  -2 1  1           //The EAMSS Score determined for this cycle alone
     * EAMSS TALLY: 0  0  2  2  2  1  0  2  1
     * X - Earliest instance of Max-Score
     * <p/>
     * You must keep track of the maximum EAMSS tally (in this case 2) and the earliest(lowest) cycle at which
     * it occurs.  If and only if, the max EAMSS tally >= 1 then from there until the end(highest cycle) of the
     * read reassign these qualities as 2 (the masking quality).  The output qualities would therefore be
     * transformed from:
     * <p/>
     * Original Qualities: 32 32 16 15 8  10 32 2  2    to
     * Final    Qualities: 32 32 2  2  2  2  2  2  2
     * X - Earliest instance of max-tally/end of masking
     * <p/>
     * IMPORTANT:
     * The one exception is: If the max EAMSS Tally is preceded by a long string of G basecalls (10 or more, with a single basecall exception per10 bases)
     * then the masking continues to the beginning of that string of G's. E.g.:
     * <p/>
     * Cycle:       1  2  3  4  5  6  7  8   9  10 11 12 13 14 15 16 17 18
     * Bases:       C  T  A  C  A  G  A  G   G  G  G  G  G  G  G  C  A  T
     * Qualities:   30 22 26 27 28 30 7  34  20 19 38 15 32 32 10 4  2  5
     * Cycle Score: -2  0  0  0  0 -2 1  -2  0  0  -2 0  -2 -2  1 1  1  1
     * EAMSS TALLY: -2 -5 -5 -5 -5 -5 -3 -4 -2 -2  -2 0   0  2  4 3  2  1
     * X- Earliest instance of Max-Tally
     * <p/>
     * Resulting Transformation:
     * Bases:                C  T  A  C  A  G  A   G   G  G  G  G  G  G  G  C  A  T
     * Original Qualities:   30 22 26 27 28 30 7  34  20 19 38 15 32 32 10  4  2  5
     * Final    Qualities:   30 22 26 27 28  2 2   2   2  2  2  2  2  2  2  2  2  2
     * X- Earliest instance of Max-Tally
     * X - Start of EAMSS masking due to G-Run
     * <p/>
     * To further clarify the exception rule here are a few examples:
     * A C G A C G G G G G G G G G G G G G G G G G G G G A C T
     * X - Earliest instance of Max-Tally
     * X - Start of EAMSS masking (with a two base call jump because we have 20 bases in the run already)
     * <p/>
     * T T G G A G G G G G G G G G G G G G G G G G G A G A C T
     * X - Earliest instance of Max-Tally
     * X - We can skip this A as well as the earlier A because we have 20 or more bases in the run already
     * X - Start of EAMSS masking (with a two base call jump because we have 20 bases in the run)
     * <p/>
     * T T G G G A A G G G G G G G G G G G G G G G G G G T T A T
     * X - Earliest instance of Max-Tally
     * X X - WE can skip these bases because the first A counts as the first skip and as far as the length of the string of G's is
     * concerned, these are both counted like G's
     * X - This A is the 20th base in the string of G's and therefore can be skipped
     * X - Note that the A's previous to the G's are only included because there are G's further on that are within the number
     * of allowable exceptions away (i.e. 2 in this instance), if there were NO G's after the A's you CANNOT count the A's
     * as part of the G strings (even if no exceptions have previously occured) In other words, the end of the string of G's
     * MUST end in a G not an "exception"
     * <p/>
     * However, if the max-tally occurs to the right of the run of Gs then this is still part of the string of G's but does count towards
     * the number of exceptions allowable
     * (e.g.)
     * T T G G G G G G G G G G A C G
     * X - Earliest instance of Max-tally
     * The first index CAN be considered as an exception, the above would be masked to
     * the following point:
     * T T G G G G G G G G G G A C G
     * X - End of EAMSS masking due to G-Run
     * <p/>
     * To sum up the final points, a string of G's CAN START with an exception but CANNOT END in an exception.
     *
     * @param bases     Bases for a single read in the cluster ( not the entire cluster )
     * @param qualities Qualities for a single read in the cluster ( not the entire cluster )
     */
    protected static void runEamssForReadInPlace(final byte[] bases, final byte[] qualities) {
        int eamssTally = 0;
        int maxTally = Integer.MIN_VALUE;
        int indexOfMax = -1;

        for (int i = bases.length - 1; i >= 0; i--) {
            final int quality = (0xff & qualities[i]);

            if (quality >= EAMSS_M2_GE_THRESHOLD) {
                eamssTally -= 2;
            } else if (quality < EAMSS_S1_LT_THRESHOLD) {
                eamssTally += 1;
            }

            if (eamssTally >= maxTally) {
                indexOfMax = i;
                maxTally = eamssTally;
            }
        }

        if (maxTally >= 1) {
            int numGs = 0;
            int exceptions = 0;

            for (int i = indexOfMax; i >= 0; i--) {
                if (bases[i] == 'G') {
                    ++numGs;
                } else {
                    final Integer skip = skipBy(i, numGs, exceptions, bases);
                    if (skip != null) {
                        exceptions += skip;
                        numGs += skip;
                        i -= (skip - 1);
                    } else {
                        break;
                    }
                }
            }

            if (numGs >= 10) {
                indexOfMax = (indexOfMax + 1) - numGs;
            }

            for (int i = indexOfMax; i < qualities.length; i++) {
                qualities[i] = MASKING_QUALITY;
            }
        }
    }

    /**
     * Determine whether or not the base at index is part of a skippable section in a run of G's, if so
     * return the number of bases that the section is composed of.
     *
     * @param index          Current index, which should be the index of a non-'G' base
     * @param numGs          The number of bases in the current string of G's for this read
     * @param prevExceptions The number of exceptions previously detected in this string by this method
     * @param bases          The bases of this read
     * @return If we have not reached our exception limit (1/every 10bases) and a G is within exceptionLimit(numGs/10)
     * indices before the current index then return index - (index of next g), else return null  Null indicates this is
     * NOT a skippable region, if we run into index 0 without finding a g then NULL is also returned
     */
    private static Integer skipBy(final int index, final int numGs, final int prevExceptions, final byte[] bases) {
        Integer skip = null;
        for (int backup = 1; backup <= index; backup++) {
            final int exceptionLimit = Math.max((numGs + backup) / 10, 1);
            if (prevExceptions + backup > exceptionLimit) {
                break;
            }
            if (bases[index - backup] == 'G') {
                skip = backup;
                break;
            }
        }

        return skip;
    }

    private class BclDataCycleFileParser implements CycleFilesParser<BclData> {
        final CloseableIterator<BclData> reader;

        public BclDataCycleFileParser(final List<File> files) {
            reader = new BclReader(files, outputMapping.getOutputReadLengths(),
                    bclQualityEvaluationStrategy, false);
        }

        @Override
        public void close() {
            reader.close();
        }

        @Override
        public BclData next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            return reader.next();
        }

        @Override
        public boolean hasNext() {
            try {
                return reader.hasNext();
            } catch (final NullPointerException npe) {
                return false;
            }
        }
    }
}
TOP

Related Classes of picard.illumina.parser.BclParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.