// Source code of org.broad.igv.feature.tribble.UCSCGeneTableCodec

/*
 * Copyright (c) 2007-2012 The Broad Institute, Inc.
 * SOFTWARE COPYRIGHT NOTICE
 * This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
 *
 * This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
 *
 * This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
 * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
 */


package org.broad.igv.feature.tribble;


import org.broad.igv.Globals;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.Exon;
import org.broad.igv.feature.Strand;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.util.StringUtils;


import java.util.List;


/**
 * @author Jim Robinson
 * @date 11/19/11
 */
public class UCSCGeneTableCodec extends UCSCCodec<BasicFeature> {


    private int nameColumn = 0;
    private int idColumn = 1;
    private int chrColumn = 2;
    private int strandColumn = 3;
    private int startColumn = 4;
    private int endColumn = 5;
    private int cdStartColumn = 6;
    private int cdEndColumn = 7;
    private int exonCountColumn = 8;
    private int startsBufferColumn = 9;
    private int endsBufferColumn = 10;
    private int frameBufferColumn = 15;


    public enum Type {


        REFFLAT, GENEPRED, UCSCGENE
    }


    private Genome genome;
    private Type type;




    public UCSCGeneTableCodec(Type type, Genome genome) {
        super(BasicFeature.class);


        this.genome = genome;
        this.type = type;
        switch (type) {
            case REFFLAT:
                break;
            case UCSCGENE:
                idColumn = 0;
                chrColumn = 1;
                strandColumn = 2;
                startColumn = 3;
                endColumn = 4;
                cdStartColumn = 5;
                cdEndColumn = 6;
                exonCountColumn = 7;
                startsBufferColumn = 8;
                endsBufferColumn = 9;
                nameColumn = 10;
                break;
            case GENEPRED:
                nameColumn = 12;
        }


    }


    /**
     * Decode a line as a Feature.
     *
     * @param line the input line to decode
     * @return Return the Feature encoded by the line,  or null if the line does not represent a feature (e.g. is
     * a comment)
     */
    public BasicFeature decode(String line) {
        if (line.startsWith("#")) {
            //Header line
            readHeaderLine(line);
            return null;
        }


        line = line.replaceAll("\"", "");
        String[] tokens = Globals.singleTabMultiSpacePattern.split(line);
        int tokenCount = tokens.length;


        if (tokenCount <= strandColumn) {
            return null;
        }


        String identifier = tokens[idColumn].trim();
        String name = null;
        if (tokenCount > nameColumn && tokens[nameColumn] != null) {
            name = tokens[nameColumn];
        }


        if (name == null || name.length() == nameColumn) {
            name = identifier;
        }


        String chrToken = tokens[chrColumn].trim();
        String chr = genome == null ? StringUtils.intern(chrToken) : genome.getChromosomeAlias(chrToken);


        int start = Integer.parseInt(tokens[startColumn]);
        int end = Integer.parseInt(tokens[endColumn]);
        String strandString = tokens[strandColumn];
        Strand strand = Strand.NONE;
        if (strandString != null) {
            if (strandString.trim().equals("+")) {
                strand = Strand.POSITIVE;
            } else if (strandString.trim().equals("-")) {
                strand = Strand.NEGATIVE;
            }
        }


        BasicFeature gene = new BasicFeature(chr, start, end, strand);


        gene.setName(name);
        gene.setIdentifier(identifier);


        if (tokenCount > 7) {
            gene.setThickStart(Integer.parseInt(tokens[6]));
            gene.setThickEnd(Integer.parseInt(tokens[7]));
        }


        // Coding information is optional
        if (tokenCount > 8) {
            createExons(tokens, tokenCount, gene, chr, strand);
        }


        // Optional standard name column
        if (tokenCount > 16) {
            gene.setAttribute("Standard Name", tokens[16]);
        }


        return gene;
    }


    /**
     * This function returns true iff the File potentialInput can be parsed by this
     * codec.
     * <p/>
     * There is an assumption that there's never a situation where two different Codecs
     * return true for the same file.  If this occurs, the recommendation would be to error out.
     * <p/>
     * Note this function must never throw an error.  All errors should be trapped
     * and false returned.
     *
     * @param path the file to test for parsability with this codec
     * @return true if potentialInput can be parsed, false otherwise
     */
    public boolean canDecode(String path) {
        return true;
    }




    private void createExons(String[] tokens, int tokenCount, BasicFeature gene, String chr,
                             Strand strand)
            throws NumberFormatException {


        int cdStart = Integer.parseInt(tokens[cdStartColumn]);
        int cdEnd = Integer.parseInt(tokens[cdEndColumn]);


        int exonCount = Integer.parseInt(tokens[exonCountColumn]);
        String[] startsBuffer = Globals.commaPattern.split(tokens[startsBufferColumn]);
        String[] endsBuffer = Globals.commaPattern.split(tokens[endsBufferColumn]);


        if (startsBuffer.length == endsBuffer.length) {


            int exonNumber = (strand == Strand.NEGATIVE ? exonCount : 1);


            for (int i = 0; i < startsBuffer.length; i++) {
                int exonStart = Integer.parseInt(startsBuffer[i]);
                int exonEnd = Integer.parseInt(endsBuffer[i]);
                Exon exon = new Exon(chr, exonStart, exonEnd, strand);
                exon.setCodingStart(cdStart);
                exon.setCodingEnd(cdEnd);
                gene.addExon(exon);


                exon.setNumber(exonNumber);
                if (strand == Strand.NEGATIVE) {
                    exonNumber--;
                } else {
                    exonNumber++;
                }


            }
        }


        List<Exon> exons = gene.getExons();


        // Walk through exons setting mRNA start position
        int start = strand == Strand.POSITIVE ? 0 : exonCount - 1;
        int end = strand == Strand.POSITIVE ? exonCount : -1;
        int increment = strand == Strand.POSITIVE ? 1 : -1;


        int mRNABase = 0;
        for (int i = start; i != end; i += increment) {
            Exon exon = exons.get(i);
            exon.setMrnaBase(mRNABase);
            mRNABase += exon.getCodingLength();
        }




        if (type == Type.GENEPRED && tokenCount > 15) {
            try {


                String[] frameBuffer = Globals.commaPattern.split(tokens[frameBufferColumn]);
                for (int i = 0; i < frameBuffer.length; i++) {
                    int exonFrame = Integer.parseInt(frameBuffer[i].trim());
                    if (exonFrame == -1) {
                        exons.get(i).setNonCoding(true);
                    } else {
                        int phase = (exonFrame == 0) ? 0 : 3 - exonFrame;
                        exons.get(i).setPhase(phase);
                    }
                }
            } catch (Exception e) {


                // Ignore -- not getting the reading frame is not the end of the world.
                //log.info("Could not parse frame buffer: ", e);
            }


        }
    }


}
// Source code of org.broad.igv.feature.tribble.UCSCGeneTableCodec

// Related classes of org.broad.igv.feature.tribble.UCSCGeneTableCodec