Source Code of org.broad.igv.feature.tribble.GFFCodec$GFF2Helper

/*
 * Copyright (c) 2007-2012 The Broad Institute, Inc.
 * SOFTWARE COPYRIGHT NOTICE
 * This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
 *
 * This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
 *
 * This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
 * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
 */


package org.broad.igv.feature.tribble;


import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.exceptions.ParserException;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.FeatureDB;
import org.broad.igv.feature.SequenceOntology;
import org.broad.igv.feature.Strand;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.track.TrackProperties;
import org.broad.igv.ui.IGV;
import org.broad.igv.ui.color.ColorUtilities;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.StringUtils;
import org.broad.igv.util.collections.CI;
import org.broad.igv.util.collections.MultiMap;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.Feature;
import htsjdk.tribble.exception.CodecLineParsingException;
import htsjdk.tribble.readers.LineIterator;


import java.util.*;


/**
 * Notes from GFF3 spec  http://www.sequenceontology.org/gff3.shtml
 * These tags have predefined meanings (tags are case sensitive):
 * <p/>
 * ID     Indicates the name of the feature (unique).
 * Name   Display name for the feature.
 * Alias  A secondary name for the feature.
 * Parent Indicates the parent of the feature.
 * <p/>
 * Specs:
 * GFF3  http://www.sequenceontology.org/gff3.shtml
 * GFF2 specification: http://www.sanger.ac.uk/resources/software/gff/spec.html
 * UCSC GFF (GFF "1") http://genome.ucsc.edu/FAQ/FAQformat#format3
 * GTF  http://mblab.wustl.edu/GTF2.html
 * UCSC GTF  http://genome.ucsc.edu/FAQ/FAQformat#format4
 * Feature type definitions http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html#7.2
 */
public class GFFCodec extends AsciiFeatureCodec<Feature> {


    private static Logger log = Logger.getLogger(GFFCodec.class);




    static HashSet<String> ignoredTypes = new HashSet();


    static {
        ignoredTypes.add("start_codon");
        ignoredTypes.add("stop_codon");
        ignoredTypes.add("Contig");
        ignoredTypes.add("RealContig");
        ignoredTypes.add("CDS_parts");
    }




    private TrackProperties trackProperties = null;
    private CI.CIHashSet featuresToHide = new CI.CIHashSet();
    private FeatureFileHeader header;
    private Helper helper;
    private Genome genome;
    private boolean fastaSection = false;


    public enum Version {
        GFF2, GFF3
    }




    /**
     * List of known "Name" fields.  Some important fields from the GFF3 spec are listed below.  Note GFF3
     * is case sensitive, however GFF2, GTF, and other variants might not be.
     * <p/>
     * ID    Indicates the ID of the feature.
     * Name   Display name for the feature.
     * Alias  A secondary name for the feature.
     */
    static String[] nameFields = {"Name", "name", "Alias", "gene", "primary_name", "locus", "alias", "systematic_id", "ID", "transcript_id"};




    public GFFCodec(Genome genome) {
        super(Feature.class);
        // Assume GFF2 until shown otherwise
        helper = new GFF2Helper();
        this.genome = genome;
    }


    public GFFCodec(Version version, Genome genome) {
        super(Feature.class);
        this.genome = genome;
        if (version == Version.GFF2) {
            helper = new GFF2Helper();
        } else {
            helper = new GFF3Helper();
        }
    }


    public void readHeaderLine(String line) {
        if (header == null) {
            header = new FeatureFileHeader();
        }
        if (line.startsWith("#track") || line.startsWith("##track")) {
            trackProperties = new TrackProperties();
            ParsingUtils.parseTrackLine(line, trackProperties);
            header.setTrackProperties(trackProperties);
        } else if (line.startsWith("##gff-version") && line.endsWith("3")) {
            helper = new GFF3Helper();
        } else if (line.startsWith("#nodecode") || line.startsWith("##nodecode")) {
            helper.setUrlDecoding(false);
        } else if (line.startsWith("#hide") || line.startsWith("##hide")) {
            String[] kv = line.split("=");
            if (kv.length > 1) {
                featuresToHide.addAll(Arrays.asList(kv[1].split(",")));
            }
        } else if (line.startsWith("#displayName") || line.startsWith("##displayName")) {
            String[] nameTokens = line.split("=");
            if (nameTokens.length < 2) {
                helper.setNameFields(null);
            } else {
                String[] fields = nameTokens[1].split(",");
                helper.setNameFields(fields);
            }
        }
    }


    public Object readActualHeader(LineIterator reader) {


        if (header == null) {
            header = new FeatureFileHeader();
        }
        String line;
        int nLines = 0;
        try {
            while (reader.hasNext()) {
                line = reader.peek();
                if (line.startsWith("#")) {
                    nLines++;
                    readHeaderLine(line);
                    reader.next();
                } else {
                    break;
                }
            }


            header.setTrackProperties(trackProperties);
            return header;
        } catch (Exception e) {
            throw new CodecLineParsingException("Error parsing header: " + e.getMessage(), e);
        }
    }


    /**
     * This function returns true iff the File potentialInput can be parsed by this
     * codec.
     * <p/>
     * There is an assumption that there's never a situation where two different Codecs
     * return true for the same file.  If this occurs, the recommendation would be to error out.
     * <p/>
     * Note this function must never throw an error.  All errors should be trapped
     * and false returned.
     *
     * @param path the file to test for parsability with this codec
     * @return true if potentialInput can be parsed, false otherwise
     */
    public boolean canDecode(String path) {
        final String pathLowerCase = path.toLowerCase();
        return pathLowerCase.endsWith(".gff") || pathLowerCase.endsWith(".gff3") ||
                pathLowerCase.endsWith(".gvf") || pathLowerCase.endsWith(".gtf");
    }


    public BasicFeature decodeLoc(String line) {
        return decode(line);
    }


    public BasicFeature decode(String line) {


        if (fastaSection) {
            return null;
        }
        if (line.startsWith("#")) {
            if (line.toUpperCase().startsWith("##FASTA")) {
                fastaSection = true;
            }
            return null;
        }


        String[] tokens = Globals.tabPattern.split(line, -1);
        int nTokens = tokens.length;


        // GFF3 files have 9 tokens,
        // TODO -- the attribute column is optional for GFF 2 and earlier (8 tokens required)
        if (nTokens < 9) {
            return null;
        }


        String chrToken = tokens[0].trim();
        String featureType = StringUtils.intern(tokens[2].trim());


        if (ignoredTypes.contains(featureType)) {
            return null;
        }


        String chromosome = genome == null ? StringUtils.intern(chrToken) : genome.getChromosomeAlias(chrToken);


        // GFF coordinates are 1-based inclusive (length = end - start + 1)
        // IGV (UCSC) coordinates are 0-based exclusive.  Adjust start and end accordingly
        int start;
        int end;
        int col = 3;
        try {
            start = Integer.parseInt(tokens[col]) - 1;
            if (start < 0) throw new ParserException("Start index must be 1 or larger; GFF is 1-based", -1, line);
            col++;
            end = Integer.parseInt(tokens[col]);
        } catch (NumberFormatException ne) {
            String msg = String.format("Column %d must contain a numeric value. %s", col + 1, ne.getMessage());
            throw new ParserException(msg, -1, line);
        }
        Strand strand = convertStrand(tokens[6]);


        String attributeString = tokens[8];


        MultiMap<String, String> attributes = new MultiMap<String, String>();


        helper.parseAttributes(attributeString, attributes);


        String id = helper.getID(attributes, featureType);
        String[] parentIds = helper.getParentIds(attributes, attributeString);


        BasicFeature f = new BasicFeature(chromosome, start, end, strand);




        // Set "thick start/end" => corresponds to coding start & end, for UTRs
        if (SequenceOntology.utrTypes.contains(featureType)) {
            boolean plus = (SequenceOntology.fivePrimeUTRTypes.contains(featureType) && strand == Strand.POSITIVE) ||
                    (SequenceOntology.threePrimeUTRTypes.contains(featureType) && strand == Strand.NEGATIVE);
            if (plus) {
                f.setThickStart(end);
            } else {
                f.setThickEnd(end);
            }
        }


        String phaseString = tokens[7].trim();
        if (!phaseString.equals(".")) {
            int phaseNum = Integer.parseInt(phaseString);
            f.setReadingFrame(phaseNum);
        }


        f.setName(helper.getName(attributes));
        f.setType(featureType);


        id = id != null ? id : "igv_" + UUID.randomUUID().toString();
        f.setIdentifier(id);


        f.setParentIds(parentIds);
        f.setAttributes(attributes);


        String[] colorNames = new String[]{"color", "Color", "colour", "Colour"};
        for (String colorName : colorNames) {
            if (attributes.containsKey(colorName)) {
                f.setColor(ColorUtilities.stringToColor(attributes.get(colorName)));
                break;
            }
        }


        if (featuresToHide.contains(featureType)) {
            if (IGV.hasInstance()) FeatureDB.addFeature(f, genome);
            return null;
        }


        return f;


    }


    public Object getHeader() {
        return header;
    }


    private Strand convertStrand(String strandString) {
        Strand strand = Strand.NONE;
        if (strandString.equals("-")) {
            strand = Strand.NEGATIVE;
        } else if (strandString.equals("+")) {
            strand = Strand.POSITIVE;
        }


        return strand;
    }


    static StringBuffer buf = new StringBuffer();


    static String getDescription(MultiMap<String, String> attributes, String type) {
        buf.setLength(0);
        buf.append(type);
        buf.append("<br>");
        attributes.printHtml(buf, 100);
        return buf.toString();
    }




    protected interface Helper {


        String[] getParentIds(MultiMap<String, String> attributes, String attributeString);


        void parseAttributes(String attributeString, MultiMap<String, String> map);


        String getID(MultiMap<String, String> attributes, String type);


        void setUrlDecoding(boolean b);


        String getName(MultiMap<String, String> attributes);


        void setNameFields(String[] fields);


    }


    public static class GFF2Helper implements Helper {


        //TODO Almost identical
        static String[] DEFAULT_NAME_FIELDS = {"alias", "gene", "ID", "Locus", "locus", "Name", "name", "primary_name", "systematic_id", "transcript_id"};
        static List<String> idFields = new ArrayList<String>(Arrays.asList(DEFAULT_NAME_FIELDS));


        static {
            idFields.add("transcript_id");
        }


        static String[] possParentNames = new String[]{"transcript_id", "id", "mRNA", "systematic_id", "gene", "transcriptId", "Parent", "proteinId"};


        private String[] nameFields;


        GFF2Helper() {
            this(DEFAULT_NAME_FIELDS);
        }


        GFF2Helper(String[] nameFields) {
            if (nameFields != null) {
                this.nameFields = nameFields;
            }


        }


        public void setUrlDecoding(boolean b) {
            // Ignored,  GFF2 files are never url DECODED
        }




        public void parseAttributes(String description, MultiMap<String, String> kvalues) {


            List<String> kvPairs = StringUtils.breakQuotedString(description.trim(), ';');
            for (String kv : kvPairs) {
                String[] tokens = kv.split(" ");
                if (tokens.length == 1) {
                    //Not space delimited, check =
                    tokens = kv.split("=");
                }
                if (tokens.length >= 2) {
                    String key = tokens[0].trim().replaceAll("\"", "");
                    String value = tokens[1].trim().replaceAll("\"", "");
                    kvalues.put(StringUtils.intern(key), value);
                }
            }
        }


        /**
         * @param attributes
         * @param attributeString
         * @return
         */


        public String[] getParentIds(MultiMap<String, String> attributes, String attributeString) {


            if (attributes.size() > 0) {
                for (String possName : possParentNames) {
                    if (attributes.containsKey(possName)) {
                        String parent = attributes.get(possName).trim();
                        if (parent.length() > 0) {
                            return new String[] {parent};
                        }
                    }
                }
            }
            return null;
        }




        public String getID(MultiMap<String, String> attributes, String type) {


            //Search for an attribute == type,  take this as ID
            String id = attributes.get(type);
            if (id != null && id.length() > 0) {
                return id;
            }


            for (String nf : idFields) {
                if (attributes.containsKey(nf)) {
                    String tmp = attributes.get(nf).trim();
                    if(tmp.length() > 0) return tmp;
                }
            }


            String tmp = getName(attributes);
            if(tmp != null && tmp.trim().length() > 0) {
                return tmp.trim();
            }


            return null;
        }


        public String getName(MultiMap<String, String> attributes) {


            if (attributes.size() > 0 && nameFields != null) {
                for (String nf : nameFields) {
                    if (attributes.containsKey(nf)) {
                        return attributes.get(nf);
                    }
                }
            }


            return null;
        }


        public void setNameFields(String[] nameFields) {
            this.nameFields = nameFields;
        }


    }


    public static class GFF3Helper implements Helper {


        static String[] DEFAULT_NAME_FIELDS = {"Name", "Alias", "ID", "gene", "locus", "gene_name"};
        private boolean useUrlDecoding = true;


        private String[] nameFields;


        public GFF3Helper() {
            this(DEFAULT_NAME_FIELDS);
        }


        GFF3Helper(String[] nameFields) {
            if (nameFields != null) {
                this.nameFields = nameFields;
            }


        }




        public String[] getParentIds(MultiMap<String, String> attributes, String ignored) {
            String parentIdString = attributes.get("Parent");
            if (parentIdString != null) {
                return parentIdString.split(",");
            } else {
                return null;
            }
        }


        /**
         * Parse the column 9 attributes.  Attributes are separated by semicolons.
         * <p/>
         * TODO -- quotes (column 9) are explicitly forbidden in GFF3 -- should breakQuotedString be used?
         *
         * @param description
         * @param kvalues
         */
        public void parseAttributes(String description, MultiMap<String, String> kvalues) {


            List<String> kvPairs = StringUtils.breakQuotedString(description.trim(), ';');
            for (String kv : kvPairs) {
                //int nValues = ParsingUtils.split(kv, tmp, '=');
                List<String> tmp = StringUtils.breakQuotedString(kv, '=');
                int nValues = tmp.size();
                if (nValues > 0) {
                    String key = tmp.get(0).trim();
                    String value = ((nValues == 1) ? "" : tmp.get(1).trim());


                    if (useUrlDecoding) {
                        key = StringUtils.decodeURL(key);
                        value = StringUtils.decodeURL(value);
                    }
                    kvalues.put(StringUtils.intern(key), value);
                } else {
                    log.info("No attributes: " + description);
                }
            }
        }


        public void setUrlDecoding(boolean useUrlDecoding) {
            this.useUrlDecoding = useUrlDecoding;
        }


        public String getName(MultiMap<String, String> attributes) {


            if (attributes.size() > 0 && nameFields != null) {
                for (String nf : nameFields) {
                    if (attributes.containsKey(nf)) {
                        return attributes.get(nf);
                    }
                }
            }


            return null;
        }


        public String getID(MultiMap<String, String> attributes, String ignore) {
            return attributes.get("ID");
        }


        public void setNameFields(String[] nameFields) {
            this.nameFields = nameFields;
        }
    }




}
Source Code of org.broad.igv.feature.tribble.GFFCodec$GFF2Helper

Related Classes of org.broad.igv.feature.tribble.GFFCodec$GFF2Helper