Source Code of org.broad.igv.feature.tribble.PSLCodec

/*
 * Copyright (c) 2007-2011 by The Broad Institute of MIT and Harvard.  All Rights Reserved.
 *
 * This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
 * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
 *
 * THE SOFTWARE IS PROVIDED "AS IS." THE BROAD AND MIT MAKE NO REPRESENTATIONS OR
 * WARRANTES OF ANY KIND CONCERNING THE SOFTWARE, EXPRESS OR IMPLIED, INCLUDING,
 * WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER
 * OR NOT DISCOVERABLE.  IN NO EVENT SHALL THE BROAD OR MIT, OR THEIR RESPECTIVE
 * TRUSTEES, DIRECTORS, OFFICERS, EMPLOYEES, AND AFFILIATES BE LIABLE FOR ANY DAMAGES
 * OF ANY KIND, INCLUDING, WITHOUT LIMITATION, INCIDENTAL OR CONSEQUENTIAL DAMAGES,
 * ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER
 * THE BROAD OR MIT SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT
 * SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
 */


package org.broad.igv.feature.tribble;


import org.broad.igv.Globals;
import org.broad.igv.feature.PSLRecord;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.Exon;
import org.broad.igv.feature.Strand;
import org.broad.igv.feature.genome.Genome;


/**
 * @author jrobinso
 * @date Aug 5, 2010
 * <p/>
 * Columns:
 * <p/>
 * 1. matches - Number of bases that match that aren't repeats
 * 2. misMatches - Number of bases that don't match
 * 3. repMatches - Number of bases that match but are part of repeats
 * 4. nCount - Number of 'N' bases
 * 5. qNumInsert - Number of inserts in query
 * 6. qBaseInsert - Number of bases inserted in query
 * 7. tNumInsert - Number of inserts in target
 * 8. tBaseInsert - Number of bases inserted in target
 * 9. strand - '+' or '-' for query strand. For translated alignments, second '+'or '-' is for genomic strand
 * 10. qName - Query sequence name
 * 11. qSize - Query sequence size
 * 12. qStart - Alignment start position in query
 * 13. qEnd - Alignment end position in query
 * 14. tName - Target sequence name
 * 15. tSize - Target sequence size
 * 16. tStart - Alignment start position in target
 * 17. tEnd - Alignment end position in target
 * 18. blockCount - Number of blocks in the alignment (a block contains no gaps)
 * 19. blockSizes - Comma-separated list of sizes of each block
 * 20. qStarts - Comma-separated list of starting positions of each block in query
 * 21. tStarts - Comma-separated list of starting positions of each block in target
 */
public class PSLCodec extends UCSCCodec<BasicFeature> {


    Genome genome;
    boolean keepText;


    public PSLCodec(){
        this(null);
    }


    public PSLCodec(Genome genome) {
        this(genome, false);
    }


    public PSLCodec(Genome genome, boolean keepText) {
        super(BasicFeature.class);
        this.genome = genome;
        this.keepText = keepText;


    }


    public PSLRecord decode(String line) {


        PSLRecord f = null;
        try {
            if (line.trim().length() == 0 || line.startsWith("#") || line.startsWith("track") ||
                    line.startsWith("browser")) {
                return null;
            }


            String[] tokens = Globals.singleTabMultiSpacePattern.split(line);
            int nTokens = tokens.length;
            if (nTokens < 21) {
                // log.info("Skipping line ")
                return null;
            }
            int tSize = Integer.parseInt(tokens[14]);
            String chrToken = tokens[13];
            String chr = genome == null ? chrToken : genome.getChromosomeAlias(chrToken);
            int start = Integer.parseInt(tokens[15]); // IS PSL 1 or ZERO based,  closed or open?


            String strandString = tokens[8];
            Strand strand = strandString.startsWith("+") ? Strand.POSITIVE : Strand.NEGATIVE;


            boolean gNeg = false;
            if (strandString.length() > 1) {
                gNeg = strandString.charAt(1) == '-';
            }


            f = new PSLRecord();
            f.setName(tokens[9]);
            f.setChr(chr);
            f.setStart(start);
            f.setEnd(Integer.parseInt(tokens[16]));
            f.setStrand(strand);


            int exonCount = Integer.parseInt(tokens[17]);
            String[] exonSizes = tokens[18].split(",");
            String[] startsBuffer = tokens[20].split(",");


            if (startsBuffer.length == exonSizes.length && exonCount == startsBuffer.length) {
                for (int i = 0; i < startsBuffer.length; i++) {
                    int exonSize = Integer.parseInt(exonSizes[i]);
                    int exonStart = Integer.parseInt(startsBuffer[i]);
                    if (gNeg) {
                        exonStart = tSize - exonStart - exonSize;
                    }
                    int exonEnd = exonStart + exonSize;
                    Exon exon = new Exon(chr, exonStart, exonEnd, strand);
                    f.addExon(exon);
                }
            } else {
                // TODO -- warn
            }


            //score = percentId = 100.0 * (match + repMatch)  / (misMatch + match + repMatch + qGapCount + tGapCount)
            int match = Integer.parseInt(tokens[0]);
            int misMatch = Integer.parseInt(tokens[1]);
            int repMatch = Integer.parseInt(tokens[2]);
            int ns = Integer.parseInt(tokens[3]);
            int qGapCount = Integer.parseInt(tokens[4]);
            int qGapBases = Integer.parseInt(tokens[5]);
            int tGapCount = Integer.parseInt(tokens[6]);
            int tGapBases = Integer.parseInt(tokens[7]);
            int qSize = Integer.parseInt(tokens[10]);


            float score = (1000.0f * (match + repMatch - misMatch - qGapCount - tGapCount)) / qSize;


            f.setMatch(match);
            f.setMisMatch(misMatch);
            f.setRepMatch(repMatch);
            f.setNs(ns);
            f.setQGapCount(qGapCount);
            f.setQGapBases(qGapBases);
            f.setTGapCount(tGapCount);
            f.setTGapBases(tGapBases);
            f.setQSize(qSize);


            f.setScore(score);


            // Build description
            StringBuffer desc = new StringBuffer();
            desc.append("matches = " + tokens[0]);
            desc.append("<br>");
            desc.append("mismatches = " + tokens[1]);
            desc.append("<br>");
            desc.append("repeat matches = " + tokens[2]);
            desc.append("<br>");
            desc.append("# inserts in query = " + tokens[4]);
            desc.append("<br>");
            desc.append("# inserts in target = " + tokens[6]);
            f.setDescription(desc.toString());


            if(keepText) {
                f.setText(line);
            }


        } catch (NumberFormatException e) {
            return null;
        }




        return f;
    }


    /**
     * This function returns true iff the File potentialInput can be parsed by this
     * codec.
     * <p/>
     * There is an assumption that there's never a situation where two different Codecs
     * return true for the same file.  If this occurs, the recommendation would be to error out.
     * <p/>
     * Note this function must never throw an error.  All errors should be trapped
     * and false returned.
     *
     * @param path the file to test for parsability with this codec
     * @return true if potentialInput can be parsed, false otherwise
     */
    public boolean canDecode(String path) {
        return path.toLowerCase().endsWith(".psl");
    }
}
Source Code of org.broad.igv.feature.tribble.PSLCodec

Related Classes of org.broad.igv.feature.tribble.PSLCodec