package org.broad.igv.feature.tribble;
import htsjdk.samtools.util.LocationAware;
import org.apache.log4j.Logger;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.Exon;
import org.broad.igv.feature.Strand;
import htsjdk.tribble.AbstractFeatureCodec;
import htsjdk.tribble.FeatureCodecHeader;
import htsjdk.tribble.readers.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
/**
 * Tribble codec that reads features from the feature table ("FT" lines) of an EMBL-format
 * file and converts each of them to a {@link BasicFeature}.
 * <p/>
 * The line-oriented parsing is done by {@link EmblTableIterator}, which produces one
 * {@link EmblRecord} per feature; {@link #decode(EmblTableIterator)} maps a record to a feature.
 *
 * @author jrobinso
 *         Date: 11/15/13
 *         Time: 1:08 PM
 */
public class EMBLTableCodec extends AbstractFeatureCodec<BasicFeature, EMBLTableCodec.EmblTableIterator> {

    private static final Logger log = Logger.getLogger(EMBLTableCodec.class);

    /** Zero-based index of the first character of the feature key on an "FT" line. */
    private static final int FEATURE_KEY_START = 5;

    /** Zero-based index one past the last character read as feature key on an "FT" line. */
    private static final int FEATURE_KEY_END = 19;

    /** Zero-based index at which the location (or a qualifier) starts on an "FT" line. */
    private static final int LOCATION_COLUMN = 21;

    public EMBLTableCodec() {
        super(BasicFeature.class);
    }

    /**
     * Reads the next record from the iterator and converts it to a feature.
     *
     * @param s the iterator created by {@link #makeSourceFromStream(java.io.InputStream)}
     * @return the decoded feature, or {@code null} if the iterator has no record to hand out
     * @throws IOException declared by the codec contract; not thrown by this implementation itself
     */
    @Override
    public BasicFeature decode(EmblTableIterator s) throws IOException {
        EmblRecord emblRecord = s.next();
        if (emblRecord == null) {
            return null;
        }

        BasicFeature feature = new BasicFeature(emblRecord.getChromosome(), emblRecord.getStart(),
                emblRecord.getEnd());
        feature.setType(emblRecord.getType());
        feature.setIdentifier(emblRecord.getIdentifier());
        feature.setName(emblRecord.getIdentifier());
        feature.setStrand(emblRecord.getStrand());
        feature.setDescription(emblRecord.getDescription());

        // An alias, when present, takes precedence over the identifier as display name.
        if (emblRecord.getAlias() != null) {
            feature.setName(emblRecord.getAlias());
        }

        // Every segment of the record location is added as an exon, whatever the feature type.
        for (Exon exon : emblRecord.getExons()) {
            feature.addExon(exon);
        }
        return feature;
    }

    /**
     * This codec does not read a header.
     *
     * @return always {@code null}
     */
    @Override
    public FeatureCodecHeader readHeader(EmblTableIterator o) throws IOException {
        return null;
    }

    /**
     * Generates a reader appropriate for use by this codec from the generic input stream.  Implementers should
     * assume the stream is buffered.
     */
    @Override
    public EmblTableIterator makeSourceFromStream(InputStream inputStream) {
        return new EmblTableIterator(inputStream);
    }

    /**
     * Generates a {@link LocationAware} reader of type EmblTableIterator.  Like {@link #makeSourceFromStream(java.io.InputStream)}, except
     * the {@link LocationAware} compatibility is required for creating indexes.
     * <p/>
     * Implementers of this method must return a type that is both {@link LocationAware} as well as EmblTableIterator.  Note that this
     * requirement cannot be enforced via the method signature due to limitations in Java's generic typing system.  Instead, consumers
     * should cast the call result into a EmblTableIterator when applicable.
     * <p/>
     * Not implemented for this codec: the method currently returns {@code null}.
     */
    @Override
    public LocationAware makeIndexableSourceFromStream(InputStream inputStream) {
        return null;  // <= indexing will fail until this is implemented.
    }

    /**
     * @return {@code true} when the iterator has no further record to hand out
     */
    @Override
    public boolean isDone(EmblTableIterator o) {
        return !o.hasNext();
    }

    /**
     * Closes the reader held by the iterator.
     */
    @Override
    public void close(EmblTableIterator o) {
        o.close();
    }

    /**
     * One feature of an EMBL feature table: its key (type), location segments and the
     * qualifiers collected from the lines that follow the feature line.
     */
    static class EmblRecord {

        private static final Logger log = Logger.getLogger(EmblRecord.class);

        boolean isNegative;
        private String type;
        private String chromosome;
        private String identifier;
        private String alias;
        private String description;
        // Starts at MAX_VALUE so that the first parsed segment always lowers it.
        private int start = Integer.MAX_VALUE;
        private int end;
        List<Exon> exons;

        /**
         * @param type       the feature key, e.g. "CDS"
         * @param chromosome the sequence name the feature belongs to
         * @param lociString comma separated list of {@code start..end} (or single position) segments
         * @param isNegative {@code true} if the feature is on the negative strand
         */
        EmblRecord(String type, String chromosome, String lociString, boolean isNegative) {
            this.isNegative = isNegative;
            this.type = type;
            this.chromosome = chromosome;
            createExons(lociString, isNegative);
        }

        /**
         * @return the smallest segment start, or 0 if no segment could be parsed
         */
        public int getStart() {
            return start;
        }

        /**
         * @return the largest segment end, or 0 if no segment could be parsed
         */
        public int getEnd() {
            return end;
        }

        /**
         * @return {@code true} if the feature key is "CDS", "3'UTR" or "5'UTR"
         */
        public boolean isGenePart() {
            return "CDS".equals(type) || "3'UTR".equals(type) || "5'UTR".equals(type);
        }

        /**
         * @return {@link Strand#NEGATIVE} or {@link Strand#POSITIVE}, according to the flag passed
         *         to the constructor
         */
        public Strand getStrand() {
            return isNegative ? Strand.NEGATIVE : Strand.POSITIVE;
        }

        /**
         * @return the feature key this record was created with
         */
        public String getType() {
            return type;
        }

        /**
         * @return the identifier, or {@code null} if none has been set
         */
        public String getIdentifier() {
            return identifier;
        }

        /**
         * @param identifier the identifier to use for this record
         */
        public void setIdentifier(String identifier) {
            this.identifier = identifier;
        }

        /**
         * @return the alias, or {@code null} if none has been set
         */
        public String getAlias() {
            return alias;
        }

        /**
         * @param alias the alias to use for this record
         */
        public void setAlias(String alias) {
            this.alias = alias;
        }

        /**
         * @return the segments parsed from the location string; never {@code null}
         */
        public List<Exon> getExons() {
            return exons;
        }

        /**
         * Adds the content of a qualifier (or qualifier continuation) line to this record.
         * <ul>
         * <li>{@code /gene=}: a value starting with "SP" becomes the identifier unless one is
         * already set; any other value becomes the alias.</li>
         * <li>{@code /systematic_id=}: the value becomes both identifier and alias.</li>
         * <li>anything else is appended to the description, without its leading '/'.</li>
         * </ul>
         * Lines with nothing after the qualifier column and empty gene/id values are ignored.
         *
         * @param nextLine the complete "FT" line, including the fixed-width prefix
         */
        public void append(String nextLine) {
            if (nextLine.length() <= LOCATION_COLUMN) {
                // Nothing after the fixed-width prefix; substring() would throw.
                return;
            }
            String attrString = nextLine.substring(LOCATION_COLUMN);

            if (attrString.startsWith("/gene=")) {
                String geneName = qualifierValue(attrString);
                if (geneName.isEmpty()) {
                    return;
                }
                if (geneName.startsWith("SP")) {
                    // Some genes have multiple identifiers.  Only use the first one
                    if (getIdentifier() == null) {
                        setIdentifier(geneName);
                    }
                } else {
                    setAlias(geneName);
                }
            } else if (attrString.startsWith("/systematic_id=")) {
                String id = qualifierValue(attrString);
                if (id.isEmpty()) {
                    return;
                }
                setIdentifier(id);
                setAlias(id);
            } else {
                // Strip only a leading '/'.  Continuation lines of a multi-line qualifier do not
                // start with one and must keep their first character.
                String text = attrString.startsWith("/") ? attrString.substring(1) : attrString;
                appendToDescription(text.trim());
            }
        }

        /**
         * Returns everything after the first '=' of a qualifier, with double quotes removed.
         * The caller guarantees that the qualifier contains an '='.
         */
        private static String qualifierValue(String qualifier) {
            return qualifier.substring(qualifier.indexOf('=') + 1).replace("\"", "");
        }

        /**
         * Appends a note to the description; notes are separated by {@code <br>}.
         *
         * @param note the text to append
         */
        public void appendToDescription(String note) {
            if (description == null) {
                description = note;
            } else {
                description += "<br>" + note;
            }
        }

        /**
         * @return the accumulated description, or {@code null} if nothing was appended
         */
        public String getDescription() {
            return description;
        }

        /**
         * Create a list of Exon objects from the Embl join string.  Apparently exons in embl
         * format are represented by a single CDS record.
         * <p/>
         * Each comma separated segment is either {@code start..end} or a single position.  One is
         * subtracted from the start while the end is used as written; a single position
         * {@code p} yields the segment {@code [p - 1, p]}.  Segments that cannot be parsed are
         * logged and skipped.  The record start/end are updated to span all parsed segments.
         *
         * @param joinString comma separated segments, already stripped of join/complement syntax
         * @param isNegative {@code true} if the exons are on the negative strand
         */
        void createExons(String joinString, boolean isNegative) {
            String[] lociArray = joinString.split(",");
            exons = new ArrayList<Exon>(lociArray.length);
            Strand strand = isNegative ? Strand.NEGATIVE : Strand.POSITIVE;

            for (String loci : lociArray) {
                String[] tmp = loci.split("\\.\\.");
                if (tmp.length == 0) {
                    // A bare ".." splits to an empty array; index 0 would throw.
                    log.error("Error parsing exon number; " + joinString);
                    continue;
                }
                try {
                    // Tokens may carry padding (e.g. where a '<' was replaced by a space);
                    // Integer.parseInt does not tolerate it.
                    int exonStart = Integer.parseInt(tmp[0].trim()) - 1;
                    int exonEnd = tmp.length > 1 ? Integer.parseInt(tmp[1].trim()) : exonStart + 1;

                    exons.add(new Exon(chromosome, exonStart, exonEnd, strand));
                    start = Math.min(start, exonStart);
                    end = Math.max(end, exonEnd);
                } catch (NumberFormatException e) {
                    log.error("Error parsing exon number; " + joinString, e);
                }
            }

            if (exons.isEmpty()) {
                // Nothing parsed: do not leak the Integer.MAX_VALUE sentinel as a coordinate.
                start = 0;
            }
        }

        /**
         * @return the sequence name this record was created with
         */
        public String getChromosome() {
            return chromosome;
        }
    }

    /**
     * Reads an EMBL file line by line and groups the "FT" lines into {@link EmblRecord}s.
     * <p/>
     * The iterator always holds the record that is currently being assembled; a record is
     * handed out by {@link #next()} once the start of the following feature (or the end of the
     * input) shows that it is complete.
     */
    static class EmblTableIterator {

        String chromosome;
        BufferedReader reader;
        // Kept for callers that inspect it; the lines themselves are read through 'reader'.
        PositionalBufferedStream is;
        EmblRecord currentRecord = null;

        /**
         * Wraps the stream and reads ahead to the first feature, so that {@link #hasNext()}
         * is meaningful right after construction.
         *
         * @param stream the EMBL data
         */
        EmblTableIterator(InputStream stream) {
            if (stream instanceof PositionalBufferedStream) {
                is = (PositionalBufferedStream) stream;
            } else {
                is = new PositionalBufferedStream(stream);
            }
            reader = new BufferedReader(new InputStreamReader(stream));

            // Advance to first record
            next();
        }

        /**
         * @return {@code true} while a record is pending, i.e. {@link #next()} has something to return
         */
        boolean hasNext() {
            return currentRecord != null;
        }

        /**
         * Returns the pending record and reads ahead until the next feature line, which starts
         * the new pending record.  At the end of the input the last pending record is returned
         * and the iterator becomes exhausted.
         * <p/>
         * "ID" lines set the chromosome for the features that follow: the first word after "ID",
         * with "chromosome" replaced by "chr" and underscores removed.  "FT" lines without a
         * feature key are passed to the pending record as qualifiers.  All other lines are skipped.
         *
         * @return the completed record; {@code null} if there was none pending (priming call or
         *         exhausted iterator) or if reading failed
         */
        EmblRecord next() {
            try {
                String nextLine;
                while ((nextLine = reader.readLine()) != null) {
                    if (nextLine.startsWith("ID")) {
                        // Chromosome change
                        String chr = getFirstWord(nextLine.substring(2));
                        chromosome = chr.replace("chromosome", "chr").replace("_", "");
                    } else if (nextLine.startsWith("FT")) {
                        String featureKey = featureKeyOf(nextLine);
                        if (featureKey.isEmpty()) {
                            // Qualifier or continuation line of the pending feature.
                            if (currentRecord != null) {
                                currentRecord.append(nextLine);
                            }
                        } else {
                            // New feature started: the pending record is complete.
                            EmblRecord returnValue = currentRecord;
                            String temp = nextLine.length() > LOCATION_COLUMN
                                    ? nextLine.substring(LOCATION_COLUMN)
                                    : "";
                            boolean isNegative = temp.contains("complement");
                            String lociString = parseJoinString(temp, reader)
                                    .replace("<", "").replace(">", "").trim();
                            currentRecord = new EmblRecord(featureKey, chromosome, lociString, isNegative);
                            return returnValue;
                        }
                    }
                    // Any other line is skipped.
                }

                // End of input: hand out the last record exactly once.  Clearing the field is
                // what makes hasNext() turn false and the enclosing read loop terminate.
                EmblRecord lastRecord = currentRecord;
                currentRecord = null;
                return lastRecord;
            } catch (IOException ex) {
                log.error("Error parsing EMBL file", ex);
                // Stop iterating; retrying a failed stream would only repeat the error.
                currentRecord = null;
                return null;
            }
        }

        /**
         * Extracts the trimmed feature key from an "FT" line; empty if the line is too short to
         * contain one or the key columns are blank.
         */
        private static String featureKeyOf(String line) {
            if (line.length() <= FEATURE_KEY_START) {
                return "";
            }
            return line.substring(FEATURE_KEY_START, Math.min(FEATURE_KEY_END, line.length())).trim();
        }

        /**
         * Returns the leading run of characters of the trimmed string up to (not including) the
         * first character for which {@link Character#isSpaceChar(char)} is true.
         */
        private String getFirstWord(String string) {
            String trimmedString = string.trim();
            int wordEnd = 0;
            while (wordEnd < trimmedString.length() && !Character.isSpaceChar(trimmedString.charAt(wordEnd))) {
                wordEnd++;
            }
            return trimmedString.substring(0, wordEnd).trim();
        }

        /**
         * Reduces an EMBL location to a plain comma separated list of segments.  A location that
         * starts with "join" or "complement" may span several lines, for example
         * <pre>
         * FT   CDS             join(complement(5000933..5001976),
         * FT                   complement(5000325..5000891),complement(5000024..5000272))
         * FT                   /product="GTPase activating protein (predicted)"
         * FT                   /gene="SPAC1952.17c"
         * FT                   /gene="SPAC890.01c"
         * </pre>
         * Continuation lines are read from the reader while there are unclosed parentheses; then
         * "join", "complement" and the parentheses are removed and every '&lt;' is replaced by a
         * space.  Any other location is returned unchanged.
         *
         * @param joinString the location text of the feature line
         * @param reader     the reader positioned on the line after the feature line
         * @return the comma separated segments
         * @throws IOException if reading a continuation line fails
         */
        public String parseJoinString(String joinString, BufferedReader reader)
                throws IOException {

            if (!joinString.startsWith("join") && !joinString.startsWith("complement")) {
                return joinString;
            }

            String location = joinString;
            // Only unclosed '(' can be completed by reading more lines.
            while (countChar(location, '(') > countChar(location, ')')) {
                String continuation = reader.readLine();
                if (continuation == null) {
                    log.error("Unexpected end of input in EMBL location: " + location);
                    break;
                }
                location += continuation.replace("FT", "").trim();
            }

            // join and complement functions irrelevant
            return location.replace("join", "")
                    .replace("complement", "")
                    .replace("(", "")
                    .replace(")", "")
                    .replace('<', ' ');
        }

        /**
         * Counts the occurrences of a character in a string.
         *
         * @param string the string to search
         * @param c      the character to count
         * @return the number of times {@code c} occurs in {@code string}
         */
        static int countChar(String string, char c) {
            int cnt = 0;
            for (int i = 0; i < string.length(); i++) {
                if (c == string.charAt(i)) {
                    cnt++;
                }
            }
            return cnt;
        }

        /**
         * Closes the underlying reader; a failure to close is logged, not propagated.
         */
        public void close() {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    log.error("Error closing EMBL reader", e);
                }
            }
        }
    }
}