/*
* Copyright (c) 2007-2012 The Broad Institute, Inc.
* SOFTWARE COPYRIGHT NOTICE
* This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
* Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*/
package org.broad.igv.track;
import org.broad.igv.AbstractHeadlessTest;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.Exon;
import org.broad.igv.feature.GFFParser;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.feature.tribble.GFFCodec;
import org.broad.igv.feature.tribble.TribbleIndexNotFoundException;
import org.broad.igv.tools.IgvTools;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import org.broad.igv.util.TestUtils;
import htsjdk.tribble.Feature;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.*;
import static org.junit.Assert.*;
/**
* User: jacob
* Date: 2012-Jun-22
*/
public class GFFFeatureSourceTest extends AbstractHeadlessTest {
@Test
public void testIsGFF() {
List<String> exts = Arrays.asList("gff3", "gvf", "gff", "gtf");
for (String ext : exts) {
String file = "test." + ext;
assertTrue(GFFFeatureSource.isGFF(file));
file += ".txt";
assertTrue(GFFFeatureSource.isGFF(file));
file += ".gz";
assertTrue(GFFFeatureSource.isGFF(file));
}
}
private List<Feature> getGeneFeatures(String filepath, String chr, int start, int end) throws Exception {
TestUtils.createIndex(filepath);
GFFFeatureSource source = getGffFeatureSource(filepath);
Iterator<Feature> feats = source.getFeatures(chr, start, end);
List<Feature> sourceFeats = new ArrayList<Feature>(2);
while (feats.hasNext()) {
Feature feat = feats.next();
assertEquals(chr, feat.getChr());
sourceFeats.add(feat);
}
return sourceFeats;
}
private GFFFeatureSource getGffFeatureSource(String filepath) throws IOException, TribbleIndexNotFoundException {
TribbleFeatureSource fs = TribbleFeatureSource.getFeatureSource(new ResourceLocator(filepath), genome);
return new GFFFeatureSource(fs);
}
@Test
public void testGetAll() throws Exception {
String filepath = TestUtils.DATA_DIR + "gff/gene.sorted.gff3";
String chr = "chr1";
int start = 0;
int end = Integer.MAX_VALUE / 2;
List<Feature> sourceFeats = getGeneFeatures(filepath, chr, start, end);
assertEquals(2, sourceFeats.size());
GFFFeatureSource source = getGffFeatureSource(filepath);
Iterator<Feature> iter = source.getFeatures(chr, start, end);
List<Feature> parserFeats = new ArrayList();
while (iter.hasNext()) parserFeats.add(iter.next());
assertEquals(parserFeats.size(), sourceFeats.size());
int sF = 0;
for (Feature f : parserFeats) {
BasicFeature sourceFeat = (BasicFeature) sourceFeats.get(sF);
BasicFeature bf = (BasicFeature) f;
assertEquals(bf.getExonCount(), sourceFeat.getExonCount());
assertEquals(bf.getIdentifier(), sourceFeat.getIdentifier());
sF++;
}
}
@Test
public void testQuery_01() throws Exception {
genome = IgvTools.loadGenome(TestUtils.DATA_DIR + "genomes/hg18_truncated_aliased.genome");
String filepath = org.broad.igv.util.TestUtils.DATA_DIR + "gff/aliased.sorted.gff";
String chr = "chr5";
int start = 120960 - 1;
int end = 125258;
List<Feature> features = getGeneFeatures(filepath, chr, start, end);
int geneCount = 0;
int rnaCount = 0;
for (Feature feat : features) {
//Feature feat = features.next();
assertEquals(chr, feat.getChr());
BasicFeature bf = (BasicFeature) feat;
String id = bf.getIdentifier().toLowerCase();
if (id.contains("gene")) geneCount++;
if (id.contains("rna")) rnaCount++;
if ("gene21".equals(id)) {
assertEquals(0, bf.getExonCount());
assertEquals("gene", bf.getType());
}
if ("rna22".equals(id)) {
assertEquals(6, bf.getExonCount());
assertEquals("mRNA", bf.getType());
}
}
assertEquals(2, geneCount);
assertEquals(2, rnaCount);
}
@Test
public void testPhaseString() throws Exception {
String filepath = TestUtils.DATA_DIR + "gff/gene.sorted.gff3";
String chr = "chr1";
int start = 0;
int end = Integer.MAX_VALUE / 2;
List<Feature> sourceFeats = getGeneFeatures(filepath, chr, start, end);
String hasPhaseId = "LOC_Os01g01010.1:exon_1";
int expPhaseNum = 1;
boolean checkedHasPhase = false;
BasicFeature bf = (BasicFeature) sourceFeats.get(0);
for (Exon exon : bf.getExons()) {
if (exon.getName().equals(hasPhaseId)) {
checkedHasPhase = true;
assertEquals(expPhaseNum, exon.getReadingFrame());
} else {
assertEquals(-1, exon.getReadingFrame());
}
}
assertTrue(checkedHasPhase);
}
/**
* Test the canonical EDEN sample file as described at http://www.sequenceontology.org/gff3.shtml
*
* @throws Exception
*/
@Test
public void testEDENSample() throws Exception {
String filepath = TestUtils.DATA_DIR + "gff/canonical.eden.sorted.gff3";
String chr = "chr1";
int start = 1000 - 1;
int end = 10000;
List<Feature> features = getGeneFeatures(filepath, chr, start, end);
assertEquals(6, features.size());
/**
* We split different coding sequences / alternative translations as different features
*/
int expmRNAFeats = 4;
int actmRNAFeats = 0;
List<String> expUniquemRNAIDs = Arrays.asList("mRNA00001", "mRNA00002", "mRNA00003");
Set<String> mRNAIds = new HashSet<String>(expUniquemRNAIDs.size());
/*
chr1 . mRNA 1300 9000 . + . ID=mRNA00003;Parent=gene00001;Name=EDEN.3
chr1 . exon 1300 1500 . + . ID=exon00001;Parent=mRNA00003
chr1 . exon 3000 3902 . + . ID=exon00003;Parent=mRNA00001,mRNA00003
chr1 . CDS 3301 3902 . + 0 ID=cds00003;Parent=mRNA00003;Name=edenprotein.3
chr1 . CDS 3391 3902 . + 0 ID=cds00004;Parent=mRNA00003;Name=edenprotein.4
chr1 . CDS 5000 5500 . + 1 ID=cds00003;Parent=mRNA00003;Name=edenprotein.3
chr1 . CDS 5000 5500 . + 1 ID=cds00004;Parent=mRNA00003;Name=edenprotein.4
chr1 . CDS 7000 7600 . + 1 ID=cds00003;Parent=mRNA00003;Name=edenprotein.3
chr1 . CDS 7000 7600 . + 1 ID=cds00004;Parent=mRNA00003;Name=edenprotein.4
*/
Set<Integer> mRNA3CdStarts = new HashSet<Integer>(Arrays.asList(3300, 3390));
for (Feature feature : features) {
BasicFeature bf = (BasicFeature) feature;
String ident = bf.getIdentifier();
List<Exon> exons = bf.getExons();
String type = bf.getType();
if (type.equals("mRNA")) {
actmRNAFeats++;
mRNAIds.add(bf.getIdentifier());
} else if (type.equals("gene")) {
assertEquals(bf.getIdentifier(), "gene00001");
continue;
} else {
continue;
}
Exon lastExon = exons.get(exons.size() - 1);
assertEquals(7600, lastExon.getCdEnd());
assertEquals(lastExon.getCdStart(), lastExon.getStart());
assertEquals(9000, lastExon.getEnd());
int midCDSInd = 2;
if (ident.equals("mRNA00001")) {
assertEquals(4, bf.getExonCount());
assertEquals(1201 - 1, exons.get(0).getCdStart());
Exon secExon = exons.get(1);
assertWholeExonCoding(secExon);
assertEquals(3000 - 1, secExon.getStart());
assertEquals(3902, secExon.getEnd());
}
if (ident.equals("mRNA00002")) {
assertEquals(3, bf.getExonCount());
assertEquals(1201 - 1, exons.get(0).getCdStart());
midCDSInd = 1;
}
if (ident.equals("mRNA00003")) {
assertEquals(4, bf.getExonCount());
boolean passedCdStart = false;
for (Exon exon : exons) {
if (exon.isNonCoding()) {
assertEquals("Entire exon is UTR but has coding region: " + exon.getName(), 0, exon.getCodingLength());
assertEquals("Entire exon is UTR but has coding region: " + exon.getName(), 0, exon.getCdEnd() - exon.getCdEnd());
} else {
//There are two coding sequences which differ only in start position
if (!passedCdStart) {
int cdStart = exon.getCdStart();
assertTrue("Exon cdStart not expected, was " + cdStart, mRNA3CdStarts.contains(cdStart));
mRNA3CdStarts.remove(cdStart);
passedCdStart = true;
}
}
assertTrue(exon.getName().contains("exon0000"));
}
assertTrue(passedCdStart);
}
Exon midCDS = exons.get(midCDSInd);
assertEquals(4999, midCDS.getStart());
assertEquals(5500, midCDS.getEnd());
assertEquals(midCDS.getStart(), midCDS.getCdStart());
assertEquals(midCDS.getEnd(), midCDS.getCdEnd());
}
assertEquals(0, mRNA3CdStarts.size());
assertEquals(expmRNAFeats, actmRNAFeats);
assertEquals(expUniquemRNAIDs.size(), mRNAIds.size());
for (String expUniquemRNAID : expUniquemRNAIDs) {
assertTrue("Expected mRNA id not found in file: " + expUniquemRNAID, mRNAIds.contains(expUniquemRNAID));
}
}
/**
* Test a GFF file which has CDS features, but no features of type "exon"
*
* @throws Exception
*/
@Test
public void testNoExons() throws Exception {
String filepath = TestUtils.DATA_DIR + "gff/NC_009084.gff";
String chr = "NC_009084.1";
int start = 0;
int end = 11302;
List<Feature> features = getGeneFeatures(filepath, chr, start, end);
assertEquals(6, features.size());
for (Feature feat : features) {
BasicFeature basicFeature = (BasicFeature) feat;
if (basicFeature.getType().equals("region")) {
assertEquals("id0", basicFeature.getIdentifier());
} else if (basicFeature.getType().equals("gene")) {
assertEquals(1, basicFeature.getExonCount());
assertTrue(basicFeature.getIdentifier().contains("gene"));
assertTrue(basicFeature.getName().contains("A1S_"));
Exon exon = basicFeature.getExons().get(0);
assertTrue("Exon name incorrect: " + exon.getName(), exon.getName().contains("YP_00"));
assertWholeExonCoding(exon);
assertEquals(0, exon.getReadingFrame());
} else {
throw new AssertionError("Unknown feature type: " + basicFeature.getType());
}
}
}
@Test
public void testtRNA() throws Exception {
String filepath = TestUtils.DATA_DIR + "gff/musa_trna.gff3";
String chr = "chr1";
int start = 26766;
int end = 26848;
List<Feature> features = getGeneFeatures(filepath, chr, start, end);
assertEquals(2, features.size());
BasicFeature gene = null, tRNA = null;
for (Feature feat : features) {
if (((BasicFeature) feat).getType().equals("gene")) {
gene = (BasicFeature) feat;
} else if (((BasicFeature) feat).getType().equals("tRNA")) {
tRNA = (BasicFeature) feat;
}
}
assertEquals(gene.getIdentifier(), tRNA.getAttributes().get("Parent"));
assertEquals(1, tRNA.getExonCount());
Exon exon = tRNA.getExons().get(0);
assertEquals(tRNA.getIdentifier(), exon.getAttributes().get("Parent"));
assertWholeExonNonCoding(exon);
}
@Test
public void testMusa_GSMUA_Achr1G00030_001() throws Exception {
String filepath = TestUtils.DATA_DIR + "gff/musa_trna.gff3";
String chr = "chr1";
int start = 20900;
int end = 26317;
List<Feature> features = getGeneFeatures(filepath, chr, start, end);
assertEquals(3, features.size());
for (Feature feat : features) {
BasicFeature bf = (BasicFeature) feat;
if (bf.getType().equals("mRNA")) {
assertEquals(8, bf.getExonCount());
for (Exon exon : bf.getExons()) {
assertEquals(bf.getIdentifier(), exon.getAttributes().get("Parent"));
assertWholeExonCoding(exon);
}
}
}
}
/**
* Test a simple description of an mRNA containing a 5'utr and cds in GFF-2. This was submitted with a bug report,
* the utr was being excluded
*
* @throws Exception
*/
@Test
public void testUTR() throws Exception {
String file = TestUtils.DATA_DIR + "gff/utr.gff";
BufferedReader reader = null;
Genome genome = null;
try {
FeatureSource source =
new GFFFeatureSource(TribbleFeatureSource.getFeatureSource(new ResourceLocator(file), genome));
Iterator<Feature> iter = source.getFeatures("chr1", 0, Integer.MAX_VALUE);
List<htsjdk.tribble.Feature> features = new ArrayList<Feature>();
while (iter.hasNext()) {
features.add(iter.next());
}
assertEquals(1, features.size());
BasicFeature feature = (BasicFeature) features.get(0);
List<Exon> exons = feature.getExons();
assertEquals(2, exons.size());
Exon firstExon = exons.get(0);
assertTrue(firstExon.isNonCoding());
assertEquals(11783, firstExon.getStart());
assertEquals(12157, firstExon.getEnd());
assertEquals(12157, firstExon.getCdStart());
Exon secondExon = exons.get(1);
assertFalse(secondExon.isNonCoding());
assertEquals(12157, secondExon.getCdStart());
assertEquals(12157, secondExon.getStart());
assertEquals(12994, secondExon.getEnd());
} finally {
if (reader != null) reader.close();
}
}
/**
* Test reconstructions of a simple mRNA from a gff3 file. This test added for a bug caused by using a GFF2
* codec with a gff3 file. IGV allows specification of gff3 by either the directive, or file extension.
*
* @throws Exception
*/
@Test
public void testRNA() throws Exception {
String filepath = TestUtils.DATA_DIR + "gff/rna1.gff3";
BufferedReader reader = null;
Genome genome = null;
try {
GFFFeatureSource source = getGffFeatureSource(filepath);
Iterator<Feature> iter = source.getFeatures("chr1", 0, Integer.MAX_VALUE);
int featureCount = 0;
Feature feature = null;
while (iter.hasNext()) {
feature = iter.next();
featureCount++;
}
assertEquals(1, featureCount);
List<Exon> exons = ((BasicFeature) feature).getExons();
assertEquals(6, exons.size());
} finally {
if (reader != null) reader.close();
}
}
private void assertWholeExonCoding(Exon exon) {
assertEquals(exon.getCdStart(), exon.getStart());
assertEquals(exon.getCdEnd(), exon.getEnd());
assertFalse(exon.isNonCoding());
}
private void assertWholeExonNonCoding(Exon exon) {
assertEquals(exon.getEnd(), exon.getCdStart());
assertEquals(0, exon.getCodingLength());
assertTrue(exon.isNonCoding());
}
}