/*
* The MIT License
*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.annotation;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.OverlapDetector;
import picard.annotation.Gene.Transcript;
import picard.annotation.Gene.Transcript.Exon;
import picard.util.TabbedTextFileWithHeaderParser;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Loads gene annotations from a refFlat file into an OverlapDetector<Gene>. Discards annotations that are not
* internally consistent, e.g. transcripts on different chromosomes or different strands.
*/
public class RefFlatReader {
private static final Log LOG = Log.getInstance(RefFlatReader.class);
// These are in the order that columns appear in refFlat format.
public enum RefFlatColumns{GENE_NAME, TRANSCRIPT_NAME, CHROMOSOME, STRAND, TX_START, TX_END, CDS_START, CDS_END,
EXON_COUNT, EXON_STARTS, EXON_ENDS}
private static final String[] RefFlatColumnLabels = new String[RefFlatColumns.values().length];
static {
for (int i = 0; i < RefFlatColumnLabels.length; ++i) {
RefFlatColumnLabels[i] = RefFlatColumns.values()[i].name();
}
}
private final File refFlatFile;
private final SAMSequenceDictionary sequenceDictionary;
RefFlatReader(final File refFlatFile, final SAMSequenceDictionary sequenceDictionary) {
this.refFlatFile = refFlatFile;
this.sequenceDictionary = sequenceDictionary;
}
static OverlapDetector<Gene> load(final File refFlatFile, final SAMSequenceDictionary sequenceDictionary) {
return new RefFlatReader(refFlatFile, sequenceDictionary).load();
}
OverlapDetector<Gene> load() {
final OverlapDetector<Gene> overlapDetector = new OverlapDetector<Gene>(0, 0);
final int expectedColumns = RefFlatColumns.values().length;
final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene =
new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>();
for (final TabbedTextFileWithHeaderParser.Row row : parser) {
final int lineNumber = parser.getCurrentLineNumber(); // getCurrentLineNumber returns the number of the next line
if (row.getFields().length != expectedColumns) {
throw new AnnotationException("Wrong number of fields in refFlat file " + refFlatFile + " at line " +
lineNumber);
}
final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
final String transcriptDescription = geneName + ":" + transcriptName;
final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
if (!isSequenceRecognized(chromosome)) {
LOG.debug("Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
} else {
List<TabbedTextFileWithHeaderParser.Row> transcriptLines = refFlatLinesByGene.get(geneName);
if (transcriptLines == null) {
transcriptLines = new ArrayList<TabbedTextFileWithHeaderParser.Row>();
refFlatLinesByGene.put(geneName, transcriptLines);
}
transcriptLines.add(row);
}
}
int longestInterval = 0;
int numIntervalsOver1MB = 0;
for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines : refFlatLinesByGene.values()) {
try {
final Gene gene = makeGeneFromRefFlatLines(transcriptLines);
overlapDetector.addLhs(gene, gene);
if (gene.length() > longestInterval) longestInterval = gene.length();
if (gene.length() > 1000000) ++numIntervalsOver1MB;
} catch (AnnotationException e) {
LOG.debug(e.getMessage() + " -- skipping");
}
}
LOG.debug("Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB);
return overlapDetector;
}
private boolean isSequenceRecognized(final String sequence) {
return (sequenceDictionary.getSequence(sequence) != null);
}
private Gene makeGeneFromRefFlatLines(final List<TabbedTextFileWithHeaderParser.Row> transcriptLines) {
final String geneName = transcriptLines.get(0).getField(RefFlatColumns.GENE_NAME.name());
final String strandStr = transcriptLines.get(0).getField(RefFlatColumns.STRAND.name());
final boolean negative = strandStr.equals("-");
final String chromosome = transcriptLines.get(0).getField(RefFlatColumns.CHROMOSOME.name());
// Figure out the extend of the gene
int start = Integer.MAX_VALUE;
int end = Integer.MIN_VALUE;
for (final TabbedTextFileWithHeaderParser.Row row: transcriptLines) {
start = Math.min(start, row.getIntegerField(RefFlatColumns.TX_START.name()) + 1);
end = Math.max(end, row.getIntegerField(RefFlatColumns.TX_END.name()));
}
final Gene gene = new Gene(chromosome, start, end, negative, geneName);
for (final TabbedTextFileWithHeaderParser.Row row: transcriptLines) {
if (!strandStr.equals(row.getField(RefFlatColumns.STRAND.name()))) {
throw new AnnotationException("Strand disagreement in refFlat file for gene " + geneName);
}
if (!chromosome.equals(row.getField(RefFlatColumns.CHROMOSOME.name()))) {
throw new AnnotationException("Chromosome disagreement(" + chromosome + " != " + row.getField(RefFlatColumns.CHROMOSOME.name()) +
") in refFlat file for gene " + geneName);
}
// This adds it to the Gene also
final Transcript tx = makeTranscriptFromRefFlatLine(gene, row);
}
return gene;
}
/**
* Conversion from 0-based half-open to 1-based inclusive intervals is done here.
*/
private Gene.Transcript makeTranscriptFromRefFlatLine(final Gene gene, final TabbedTextFileWithHeaderParser.Row row) {
final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
final String transcriptDescription = geneName + ":" + transcriptName;
final int exonCount = Integer.parseInt(row.getField(RefFlatColumns.EXON_COUNT.name()));
final String[] exonStarts = row.getField(RefFlatColumns.EXON_STARTS.name()).split(",");
final String[] exonEnds = row.getField(RefFlatColumns.EXON_ENDS.name()).split(",");
if (exonCount != exonStarts.length) {
throw new AnnotationException("Number of exon starts does not agree with number of exons for " + transcriptDescription);
}
if (exonCount != exonEnds.length) {
throw new AnnotationException("Number of exon ends does not agree with number of exons for " + transcriptDescription);
}
final int transcriptionStart = row.getIntegerField(RefFlatColumns.TX_START.name()) + 1;
final int transcriptionEnd = row.getIntegerField(RefFlatColumns.TX_END.name());
final int codingStart = row.getIntegerField(RefFlatColumns.CDS_START.name()) + 1;
final int codingEnd = row.getIntegerField(RefFlatColumns.CDS_END.name());
final Transcript tx = gene.addTranscript(transcriptName, transcriptionStart, transcriptionEnd, codingStart, codingEnd, exonCount);
for (int i = 0; i < exonCount; ++i) {
final Exon e = tx.addExon(Integer.parseInt(exonStarts[i]) + 1, Integer.parseInt(exonEnds[i]));
if (e.start >= e.end) {
throw new AnnotationException("Exon has 0 or negative extent for " + transcriptDescription);
}
if (i > 0 && tx.exons[i-1].end >= tx.exons[i].start) {
throw new AnnotationException("Exons overlap for " + transcriptDescription);
}
}
return tx;
}
}