Package org.broad.igv.feature.tribble

Source Code of org.broad.igv.feature.tribble.UCSCGeneTableCodec

/*
* Copyright (c) 2007-2012 The Broad Institute, Inc.
* SOFTWARE COPYRIGHT NOTICE
* This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
* Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*/

package org.broad.igv.feature.tribble;

import org.broad.igv.Globals;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.Exon;
import org.broad.igv.feature.Strand;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.util.StringUtils;

import java.util.List;

/**
* @author Jim Robinson
* @date 11/19/11
*/
public class UCSCGeneTableCodec extends UCSCCodec<BasicFeature> {

    private int nameColumn = 0;
    private int idColumn = 1;
    private int chrColumn = 2;
    private int strandColumn = 3;
    private int startColumn = 4;
    private int endColumn = 5;
    private int cdStartColumn = 6;
    private int cdEndColumn = 7;
    private int exonCountColumn = 8;
    private int startsBufferColumn = 9;
    private int endsBufferColumn = 10;
    private int frameBufferColumn = 15;

    public enum Type {

        REFFLAT, GENEPRED, UCSCGENE
    }

    private Genome genome;
    private Type type;


    public UCSCGeneTableCodec(Type type, Genome genome) {
        super(BasicFeature.class);

        this.genome = genome;
        this.type = type;
        switch (type) {
            case REFFLAT:
                break;
            case UCSCGENE:
                idColumn = 0;
                chrColumn = 1;
                strandColumn = 2;
                startColumn = 3;
                endColumn = 4;
                cdStartColumn = 5;
                cdEndColumn = 6;
                exonCountColumn = 7;
                startsBufferColumn = 8;
                endsBufferColumn = 9;
                nameColumn = 10;
                break;
            case GENEPRED:
                nameColumn = 12;
        }

    }

    /**
     * Decode a line as a Feature.
     *
     * @param line the input line to decode
     * @return Return the Feature encoded by the line,  or null if the line does not represent a feature (e.g. is
     * a comment)
     */
    public BasicFeature decode(String line) {
        if (line.startsWith("#")) {
            //Header line
            readHeaderLine(line);
            return null;
        }

        line = line.replaceAll("\"", "");
        String[] tokens = Globals.singleTabMultiSpacePattern.split(line);
        int tokenCount = tokens.length;

        if (tokenCount <= strandColumn) {
            return null;
        }

        String identifier = tokens[idColumn].trim();
        String name = null;
        if (tokenCount > nameColumn && tokens[nameColumn] != null) {
            name = tokens[nameColumn];
        }

        if (name == null || name.length() == nameColumn) {
            name = identifier;
        }

        String chrToken = tokens[chrColumn].trim();
        String chr = genome == null ? StringUtils.intern(chrToken) : genome.getChromosomeAlias(chrToken);

        int start = Integer.parseInt(tokens[startColumn]);
        int end = Integer.parseInt(tokens[endColumn]);
        String strandString = tokens[strandColumn];
        Strand strand = Strand.NONE;
        if (strandString != null) {
            if (strandString.trim().equals("+")) {
                strand = Strand.POSITIVE;
            } else if (strandString.trim().equals("-")) {
                strand = Strand.NEGATIVE;
            }
        }

        BasicFeature gene = new BasicFeature(chr, start, end, strand);

        gene.setName(name);
        gene.setIdentifier(identifier);

        if (tokenCount > 7) {
            gene.setThickStart(Integer.parseInt(tokens[6]));
            gene.setThickEnd(Integer.parseInt(tokens[7]));
        }

        // Coding information is optional
        if (tokenCount > 8) {
            createExons(tokens, tokenCount, gene, chr, strand);
        }

        // Optional standard name column
        if (tokenCount > 16) {
            gene.setAttribute("Standard Name", tokens[16]);
        }

        return gene;
    }

    /**
     * This function returns true iff the File potentialInput can be parsed by this
     * codec.
     * <p/>
     * There is an assumption that there's never a situation where two different Codecs
     * return true for the same file.  If this occurs, the recommendation would be to error out.
     * <p/>
     * Note this function must never throw an error.  All errors should be trapped
     * and false returned.
     *
     * @param path the file to test for parsability with this codec
     * @return true if potentialInput can be parsed, false otherwise
     */
    public boolean canDecode(String path) {
        return true;
    }


    private void createExons(String[] tokens, int tokenCount, BasicFeature gene, String chr,
                             Strand strand)
            throws NumberFormatException {

        int cdStart = Integer.parseInt(tokens[cdStartColumn]);
        int cdEnd = Integer.parseInt(tokens[cdEndColumn]);

        int exonCount = Integer.parseInt(tokens[exonCountColumn]);
        String[] startsBuffer = Globals.commaPattern.split(tokens[startsBufferColumn]);
        String[] endsBuffer = Globals.commaPattern.split(tokens[endsBufferColumn]);

        if (startsBuffer.length == endsBuffer.length) {

            int exonNumber = (strand == Strand.NEGATIVE ? exonCount : 1);

            for (int i = 0; i < startsBuffer.length; i++) {
                int exonStart = Integer.parseInt(startsBuffer[i]);
                int exonEnd = Integer.parseInt(endsBuffer[i]);
                Exon exon = new Exon(chr, exonStart, exonEnd, strand);
                exon.setCodingStart(cdStart);
                exon.setCodingEnd(cdEnd);
                gene.addExon(exon);

                exon.setNumber(exonNumber);
                if (strand == Strand.NEGATIVE) {
                    exonNumber--;
                } else {
                    exonNumber++;
                }

            }
        }

        List<Exon> exons = gene.getExons();

        // Walk through exons setting mRNA start position
        int start = strand == Strand.POSITIVE ? 0 : exonCount - 1;
        int end = strand == Strand.POSITIVE ? exonCount : -1;
        int increment = strand == Strand.POSITIVE ? 1 : -1;

        int mRNABase = 0;
        for (int i = start; i != end; i += increment) {
            Exon exon = exons.get(i);
            exon.setMrnaBase(mRNABase);
            mRNABase += exon.getCodingLength();
        }


        if (type == Type.GENEPRED && tokenCount > 15) {
            try {

                String[] frameBuffer = Globals.commaPattern.split(tokens[frameBufferColumn]);
                for (int i = 0; i < frameBuffer.length; i++) {
                    int exonFrame = Integer.parseInt(frameBuffer[i].trim());
                    if (exonFrame == -1) {
                        exons.get(i).setNonCoding(true);
                    } else {
                        int phase = (exonFrame == 0) ? 0 : 3 - exonFrame;
                        exons.get(i).setPhase(phase);
                    }
                }
            } catch (Exception e) {

                // Ignore -- not getting the reading frame is not the end of the world.
                //log.info("Could not parse frame buffer: ", e);
            }

        }
    }

}
TOP

Related Classes of org.broad.igv.feature.tribble.UCSCGeneTableCodec

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.