Package org.broad.igv.feature.genome

Source Code of org.broad.igv.feature.genome.Genome

/*
* Copyright (c) 2007-2012 The Broad Institute, Inc.
* SOFTWARE COPYRIGHT NOTICE
* This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
* Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*/


/*
* Genome.java
*
* Created on November 9, 2007, 9:05 AM
*
* To change this template, choose Tools | Template Manager
* and open the template in the editor.
*/
package org.broad.igv.feature.genome;

import org.apache.log4j.Logger;
import org.broad.igv.DirectoryManager;
import org.broad.igv.Globals;
import org.broad.igv.feature.Chromosome;
import org.broad.igv.feature.Cytoband;
import org.broad.igv.track.FeatureTrack;
import org.broad.igv.ui.util.MessageUtils;

import java.io.*;
import java.util.*;

/**
* Simple model of a genome.  Keeps an ordered list of Chromosomes, an alias table, and genome position offsets
* for each chromosome to support a whole-genome view.
*/
public class Genome {

    private static Logger log = Logger.getLogger(Genome.class);
    public static final int MAX_WHOLE_GENOME = 10000;

    private String id;
    private String displayName;
    private List<String> chromosomeNames;
    private ArrayList<String> longChromosomeNames;
    private LinkedHashMap<String, Chromosome> chromosomeMap;
    private long totalLength = -1;
    private long nominalLength = -1;
    private Map<String, Long> cumulativeOffsets = new HashMap();
    private Map<String, String> chrAliasTable;
    private Sequence sequence;
    private FeatureTrack geneTrack;
    private String species;

    /**
     * @param id
     * @param displayName
     * @param sequence       the reference Sequence object.  Can be null.
     * @param chromosOrdered Whether the chromosomes are already ordered. If false, they will be sorted.
     */
    public Genome(String id, String displayName, Sequence sequence, boolean chromosOrdered) {
        this.id = id;
        this.displayName = displayName;
        this.chrAliasTable = new HashMap<String, String>();
        this.sequence = sequence;
        chromosomeNames = sequence.getChromosomeNames();

        List<Chromosome> tmpChromos = new ArrayList<Chromosome>(chromosomeNames.size());
        int maxLength = -1;
        chromosomeMap = new LinkedHashMap<String, Chromosome>(tmpChromos.size());

        for (int i = 0; i < chromosomeNames.size(); i++) {
            String chr = chromosomeNames.get(i);
            int length = sequence.getChromosomeLength(chr);
            maxLength = Math.max(maxLength, length);
            Chromosome chromo = new Chromosome(i, chr, length);
            tmpChromos.add(chromo);

            if (chromosOrdered) {
                chromosomeMap.put(chr, chromo);
            }
        }

        if (!chromosOrdered) {
            ChromosomeComparator.sortChromosomeList(tmpChromos, maxLength / 10, chromosomeMap);
            chromosomeNames = new ArrayList<String>(chromosomeMap.keySet());
        }

        initializeChromosomeAliases();
    }


    /**
     * Alternate constructor for defining a minimal genome, usually from parsing a chrom.sizes file.
     *
     * @param id
     * @param chromosomes
     */
    public Genome(String id, List<Chromosome> chromosomes) {
        this.id = id;
        this.displayName = id;
        this.chrAliasTable = new HashMap<String, String>();
        this.sequence = null;

        chromosomeNames = new ArrayList<String>(chromosomes.size());
        chromosomeMap = new LinkedHashMap<String, Chromosome>(chromosomes.size());
        for (Chromosome chromosome : chromosomes) {
            chromosomeNames.add(chromosome.getName());
            chromosomeMap.put(chromosome.getName(), chromosome);
        }
        initializeChromosomeAliases();

    }


    public String getChromosomeAlias(String str) {
        if (str == null) {
            return str;
        } else {
            //We intern strings used as chromosomes
            //to prevent storing multiple times
            if (!chrAliasTable.containsKey(str)) {
                chrAliasTable.put(str, str);
            }
            return chrAliasTable.get(str);
        }
    }

    public Map<String, String> getChrAliasTable() {
        return chrAliasTable;
    }


    /**
     * Populate the chr alias table.  The input is a collection of chromosome synonym lists.  The
     * directionality is determined by the "true" chromosome names.
     *
     * @param synonymsList
     */
    public void addChrAliases(Collection<Collection<String>> synonymsList) {

        if(chrAliasTable == null) chrAliasTable = new HashMap<String, String>();

        // Convert names to a set for fast "contains" testing.
        Set<String> chrNameSet = new HashSet<String>(chromosomeNames);

        for (Collection<String> synonyms : synonymsList) {

            // Find the chromosome name as used in this genome
            String chr = null;
            for (String syn : synonyms) {
                if (chrNameSet.contains(syn)) {
                    chr = syn;
                    break;
                }
            }

            // If found register aliases
            if (chr != null) {
                for (String syn : synonyms) {
                    chrAliasTable.put(syn, chr);
                }
            } else {
                // Nothing to do.  SHould this be logged?
            }
        }
    }


    /**
     * Update the chromosome alias table with common variations
     */
    void initializeChromosomeAliases() {
        chrAliasTable.putAll(getAutoAliases());

    }


    Map<String, String> getAutoAliases() {

        Map<String, String> autoAliases = new HashMap<String, String>();

        for (String name : chromosomeNames) {
            if (name.startsWith("gi|")) {
                // NCBI
                String alias = getNCBIName(name);
                autoAliases.put(alias, name);
            }
        }

        if (chromosomeNames.size() < 10000) {
            for (String name : chromosomeNames) {

                // UCSC Conventions
                if (name.toLowerCase().startsWith("chr")) {
                    autoAliases.put(name.substring(3), name);
                } else {
                    autoAliases.put("chr" + name, name);
                }
            }


            // These are legacy mappings,  these are now defined in the genomes alias file
            if (id.startsWith("hg") || id.equalsIgnoreCase("1kg_ref"))

            {
                autoAliases.put("23", "chrX");
                autoAliases.put("24", "chrY");
                autoAliases.put("MT", "chrM");
            } else if (id.startsWith("mm"))

            {
                autoAliases.put("21", "chrX");
                autoAliases.put("22", "chrY");
                autoAliases.put("MT", "chrM");
            } else if (id.equals("b37"))

            {
                autoAliases.put("chrM", "MT");
                autoAliases.put("chrX", "23");
                autoAliases.put("chrY", "24");

            }

            Collection<Map.Entry<String, String>> aliasEntries = new ArrayList(autoAliases.entrySet());
            for (Map.Entry<String, String> aliasEntry : aliasEntries) {
                // Illumina conventions
                String alias = aliasEntry.getKey();
                String chr = aliasEntry.getValue();
                if (!alias.endsWith(".fa")) {
                    String illuminaName = alias + ".fa";
                    autoAliases.put(illuminaName, chr);
                }
                if (!chr.endsWith(".fa")) {
                    String illuminaName = chr + ".fa";
                    autoAliases.put(illuminaName, chr);
                }
            }
        }
        return autoAliases;
    }
    /**
     * Extract the user friendly name from an NCBI accession
     * example: gi|125745044|ref|NC_002229.3|  =>  NC_002229.3
     */
    public static String getNCBIName(String name) {

        String[] tokens = name.split("\\|");
        return tokens[tokens.length - 1];
    }


    /**
     * Return the chromosome name associated with the "home" button,  usually the whole genome chromosome.
     *
     * @return
     */
    public String getHomeChromosome() {
        if (chromosomeNames.size() == 1 || chromosomeNames.size() > MAX_WHOLE_GENOME) {
            return chromosomeNames.get(0);
        } else {
            return Globals.CHR_ALL;
        }
    }


    public Chromosome getChromosome(String chrName) {
        return chromosomeMap.get(getChromosomeAlias(chrName));
    }


    public List<String> getAllChromosomeNames() {
        return chromosomeNames;
    }


    public Collection<Chromosome> getChromosomes() {
        return chromosomeMap.values();
    }


    public long getTotalLength() {
        if (totalLength < 0) {
            totalLength = 0;
            for (Chromosome chr : chromosomeMap.values()) {
                totalLength += chr.getLength();
            }
        }
        return totalLength;
    }


    public long getCumulativeOffset(String chr) {

        Long cumOffset = cumulativeOffsets.get(chr);
        if (cumOffset == null) {
            long offset = 0;
            for (String c : getLongChromosomeNames()) {
                if (chr.equals(c)) {
                    break;
                }
                offset += getChromosome(c).getLength();
            }
            cumOffset = new Long(offset);
            cumulativeOffsets.put(chr, cumOffset);
        }
        return cumOffset.longValue();
    }

    /**
     * Covert the chromosome coordinate in basepairs to genome coordinates in kilo-basepairs
     *
     * @param chr
     * @param locationBP
     * @return The overall genome coordinate, in kilo-bp
     */
    public int getGenomeCoordinate(String chr, int locationBP) {
        return (int) ((getCumulativeOffset(chr) + locationBP) / 1000);
    }

    /**
     * Translate a genome coordinate, in kilo-basepairs, to a chromosome & position in basepairs.
     *
     * @param genomeKBP The "genome coordinate" in kilo-basepairs.  This is the distance in kbp from the start of the
     *                  first chromosome.
     * @return the position on the corresponding chromosome
     */
    public ChromosomeCoordinate getChromosomeCoordinate(int genomeKBP) {

        long cumOffset = 0;
        List<String> wgChrNames = getLongChromosomeNames();
        for (String c : wgChrNames) {
            int chrLen = getChromosome(c).getLength();
            if ((cumOffset + chrLen) / 1000 > genomeKBP) {
                int bp = (int) (genomeKBP * 1000 - cumOffset);
                return new ChromosomeCoordinate(c, bp);
            }
            cumOffset += chrLen;
        }


        String c = wgChrNames.get(wgChrNames.size() - 1);
        int bp = (int) (genomeKBP - cumOffset) * 1000;
        return new ChromosomeCoordinate(c, bp);
    }

    /**
     * Method description
     *
     * @return
     */
    public String getId() {
        return id;
    }

    public String getSpecies() {
        if (species == null) {
            species = Genome.getSpeciesForID(id);
        }
        return species;
    }

    public String getNextChrName(String chr) {
        List<String> chrList = getLongChromosomeNames();
        for (int i = 0; i < chrList.size() - 1; i++) {
            if (chrList.get(i).equals(chr)) {
                return chrList.get(i + 1);
            }
        }
        return null;
    }

    public String getPrevChrName(String chr) {
        List<String> chrList = getLongChromosomeNames();
        for (int i = chrList.size() - 1; i > 0; i--) {
            if (chrList.get(i).equals(chr)) {
                return chrList.get(i - 1);
            }
        }
        return null;
    }

    /**
     * Return the nucleotide sequence on the + strand for the genomic interval.  This method can return null
     * if sequence is not available.
     *
     * @param chr
     * @param start start position in "zero-based" coordinates
     * @param end   end position
     * @return sequence, or null if not available
     * @api
     */
    public byte[] getSequence(String chr, int start, int end) {

        if (sequence == null) {
            return null;
        }

        Chromosome c = getChromosome(chr);
        if (c == null) {
            return null;
        }
        end = Math.min(end, c.getLength());
        if (end <= start) {
            return null;
        }
        return sequence.getSequence(chr, start, end);
    }

    public String getDisplayName() {
        return displayName;
    }

    /**
     * Return the reference base at the given position.  Can return null if reference sequence is unknown
     *
     * @param chr
     * @param pos
     * @return the reference base, or null if unknown
     */
    public byte getReference(String chr, int pos) {
        return sequence == null ? null : sequence.getBase(chr, pos);
    }


    public void setCytobands(LinkedHashMap<String, List<Cytoband>> chrCytoMap) {

        for (Map.Entry<String, List<Cytoband>> entry : chrCytoMap.entrySet()) {
            String chr = entry.getKey();
            List<Cytoband> cytobands = entry.getValue();

            Chromosome chromosome = chromosomeMap.get(chr);
            if (chromosome != null) {
                chromosome.setCytobands(cytobands);
            }
        }

    }

    public void setGeneTrack(FeatureTrack geneFeatureTrack) {
        this.geneTrack = geneFeatureTrack;
    }

    /**
     * Return the annotation track associated with this genome.   Can return null
     *
     * @return a FeatureTrack, or null
     */
    public FeatureTrack getGeneTrack() {
        return geneTrack;
    }

    /**
     * Return "getChromosomeNames()" with small chromosomes removed.
     *
     * @return
     */
    public List<String> getLongChromosomeNames() {
        if (longChromosomeNames == null) {
            longChromosomeNames = new ArrayList<String>(getAllChromosomeNames().size());
            long genomeLength = getTotalLength();
            int maxChromoLength = -1;
            for (String chrName : getAllChromosomeNames()) {
                Chromosome chr = getChromosome(chrName);
                int length = chr.getLength();
                maxChromoLength = Math.max(maxChromoLength, length);
                if (length > (genomeLength / 3000)) {
                    longChromosomeNames.add(chrName);
                }
            }

            /**
             * At this point, we should have some long chromosome names.
             * However, some genomes (draft ones perhaps) maybe have many small ones
             * which aren't big enough. We arbitrarily take those which are above
             * half the size of the max, only if the first method didn't work.
             */
            if (longChromosomeNames.size() == 0) {
                for (String chrName : getAllChromosomeNames()) {
                    Chromosome chr = getChromosome(chrName);
                    int length = chr.getLength();
                    if (length > maxChromoLength / 2) {
                        longChromosomeNames.add(chrName);
                    }
                }
            }
        }
        return longChromosomeNames;
    }

    public long getNominalLength() {
        if (nominalLength < 0) {
            nominalLength = 0;
            for (String chrName : getLongChromosomeNames()) {
                Chromosome chr = getChromosome(chrName);
                nominalLength += chr.getLength();
            }
        }
        return nominalLength;
    }


    // TODO A hack (
    // obviously),  we need to record a species in the genome definitions
    private static Map<String, String> ucscSpeciesMap;

    private static synchronized String getSpeciesForID(String id) {
        if (ucscSpeciesMap == null) {
            ucscSpeciesMap = new HashMap<String, String>();

            InputStream is = null;

            try {
                is = Genome.class.getResourceAsStream("speciesMapping.txt");
                BufferedReader br = new BufferedReader(new InputStreamReader(is));

                String nextLine;
                while ((nextLine = br.readLine()) != null) {
                    if (nextLine.startsWith("#")) continue;
                    String[] tokens = Globals.tabPattern.split(nextLine);
                    if (tokens.length == 2) {
                        ucscSpeciesMap.put(tokens[0], tokens[1]);
                    } else {
                        log.error("Unexpected number of tokens in species mapping file for line: " + nextLine);
                    }
                }
            } catch (IOException e) {
                log.error("Error reading species mapping table", e);
            } finally {
                if (is != null) try {
                    is.close();
                } catch (IOException e) {
                    log.error("", e);
                }
            }

        }

        for (Map.Entry<String, String> entry : ucscSpeciesMap.entrySet()) {
            if (id.startsWith(entry.getKey())) {
                return entry.getValue();
            }
        }
        return null;
    }


}
TOP

Related Classes of org.broad.igv.feature.genome.Genome

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.