/*
* Copyright (c) 2007-2013 The Broad Institute, Inc.
* SOFTWARE COPYRIGHT NOTICE
* This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
* Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*/
package org.broad.igv.feature.tribble;
import org.broad.igv.Globals;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.Strand;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.track.TrackProperties;
import org.broad.igv.track.TrackType;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.collections.MultiMap;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.Feature;
import htsjdk.tribble.exception.CodecLineParsingException;
import htsjdk.tribble.readers.LineIterator;
/**
* Basically BED format with some columns rearranged
* <p/> Columns, from UCSC documentation
* <p/>
* 0 bin 585 smallint(5) unsigned Indexing field to speed chromosome range queries.
* 1 swScore 1504 int(10) unsigned Smith Waterman alignment score
* 2 milliDiv 13 int(10) unsigned Base mismatches in parts per thousand
* 3 milliDel 4 int(10) unsigned Bases deleted in parts per thousand
* 4 milliIns 13 int(10) unsigned Bases inserted in parts per thousand
* 5 genoName chr1 varchar(255) Genomic sequence name
* 6 genoStart 10000 int(10) unsigned Start in genomic sequence
* 7 genoEnd 10468 int(10) unsigned End in genomic sequence
* 8 genoLeft -249240153 int(11) -#bases after match in genomic sequence
* 9 strand + char(1) Relative orientation + or -
* 10 repName (CCCTAA)n varchar(255) Name of repeat
* 11 repClass Simple_repeat varchar(255) Class of repeat
* 12 repFamily Simple_repeat varchar(255) Family of repeat
* 13 repStart 1 int(11) Start (if strand is +) or -#bases after match (if strand is -) in repeat sequence
* 14 repEnd 463 int(11) End in repeat sequence
* 15 repLeft 0 int(11) -#bases after match (if strand is +) or start (if strand is -) in repeat sequence
* 16 id 1 char(1) First digit of id field in RepeatMasker .out file. Best ignored.
*/
public class REPMaskCodec extends AsciiFeatureCodec<BasicFeature> {

    /**
     * Minimum number of columns a line must have to be parsed as a UCSC rmsk table row.
     * Lines with fewer columns are handed to {@link #decodeLegacy(String[])}.
     */
    private static final int MIN_REPMASK_COLUMNS = 15;

    /** Header built by the most recent call to {@link #readActualHeader(LineIterator)}. */
    FeatureFileHeader header;

    /** Genome used to resolve chromosome aliases; may be null, in which case names are used as-is. */
    Genome genome;

    /**
     * Creates a codec for RepeatMasker table files.
     *
     * @param genome genome whose {@code getChromosomeAlias} is applied to chromosome names,
     *               or null to keep the names exactly as they appear in the file
     */
    public REPMaskCodec(Genome genome) {
        super(BasicFeature.class);
        this.genome = genome;
    }

    /**
     * Consumes the leading header lines (those starting with "#", "track" or "browser")
     * and records what they declare. The iterator is left positioned on the first line
     * that is not a header line.
     * <p/>
     * The track type defaults to {@link TrackType#REPMASK}; a "#type=NAME" line overrides it
     * when NAME is a valid {@link TrackType} constant. A "track" line is parsed into
     * {@link TrackProperties}.
     *
     * @param reader line source positioned at the start of the file
     * @return the populated {@link FeatureFileHeader}
     * @throws CodecLineParsingException if reading or parsing a header line fails
     */
    public Object readActualHeader(LineIterator reader) {
        header = new FeatureFileHeader();
        header.setTrackType(TrackType.REPMASK);
        try {
            while (reader.hasNext()) {
                // Peek first so the first data line is not consumed.
                String nextLine = reader.peek();
                boolean isHeaderLine = nextLine.startsWith("#")
                        || nextLine.startsWith("track")
                        || nextLine.startsWith("browser");
                if (!isHeaderLine) {
                    break;
                }
                reader.next();

                if (nextLine.startsWith("#type")) {
                    String[] tokens = nextLine.split("=");
                    if (tokens.length > 1) {
                        try {
                            header.setTrackType(TrackType.valueOf(tokens[1]));
                        } catch (Exception ignored) {
                            // Best effort: an unrecognised type name is not fatal,
                            // the previously set track type stays in effect.
                        }
                    }
                } else if (nextLine.startsWith("track")) {
                    TrackProperties tp = new TrackProperties();
                    ParsingUtils.parseTrackLine(nextLine, tp);
                    header.setTrackProperties(tp);
                }
            }
            return header;
        } catch (Exception e) {
            throw new CodecLineParsingException("Error parsing header: " + e.getMessage(), e);
        }
    }

    /**
     * Reports whether this codec can parse the given file.
     * <p/>
     * This implementation performs no inspection of the file and always answers true.
     *
     * @param path the file to test for parsability with this codec
     * @return always true
     */
    public boolean canDecode(String path) {
        return true; // Optimistic!
    }

    /**
     * Return an abbreviated feature, containing only location information. Used for indexing.
     * <p/>
     * Lines with fewer than {@value #MIN_REPMASK_COLUMNS} columns are delegated to
     * {@link #decodeLegacy(String[])}.
     *
     * @param line a single line of the file
     * @return a feature carrying chromosome, start and end; null for blank lines, lines
     *         starting with "#", and lines with fewer than two columns
     */
    public Feature decodeLoc(String line) {
        // Same guard as decode(): comment lines must not reach Integer.parseInt.
        if (isBlankOrComment(line)) {
            return null;
        }
        String[] tokens = Globals.singleTabMultiSpacePattern.split(line);
        if (tokens.length < MIN_REPMASK_COLUMNS) {
            return decodeLegacy(tokens);
        }
        String chr = resolveChromosome(tokens[5]);
        int start = Integer.parseInt(tokens[6]);
        int end = Integer.parseInt(tokens[7]);
        return new BasicFeature(chr, start, end);
    }

    /**
     * Decodes one line into a feature with location, strand, name and repeat attributes.
     * <p/>
     * Column indices follow the UCSC rmsk table layout: genoName (5), genoStart (6),
     * genoEnd (7), strand (9), repName (10). Lines with fewer than
     * {@value #MIN_REPMASK_COLUMNS} columns are delegated to {@link #decodeLegacy(String[])}.
     *
     * @param nextLine a single line of the file
     * @return the decoded feature; null for blank lines, lines starting with "#", and
     *         lines with fewer than two columns
     */
    public BasicFeature decode(String nextLine) {
        if (isBlankOrComment(nextLine)) {
            return null;
        }
        String[] tokens = Globals.singleTabMultiSpacePattern.split(nextLine);
        if (tokens.length < MIN_REPMASK_COLUMNS) {
            return decodeLegacy(tokens);
        }

        String chr = resolveChromosome(tokens[5]);
        int start = Integer.parseInt(tokens[6]);
        int end = Integer.parseInt(tokens[7]);
        BasicFeature feature = new BasicFeature(chr, start, end);

        // Strand is column 9; column 3 is milliDel (reported below as an attribute).
        feature.setStrand(parseStrand(tokens[9]));

        String name = tokens[10];
        feature.setName(name);
        feature.setIdentifier(name);

        MultiMap<String, String> attributes = new MultiMap<String, String>();
        attributes.put("Smith Waterman score", tokens[1]);
        attributes.put("base mismatches per thousand", tokens[2]);
        attributes.put("bases deleted per thousand", tokens[3]);
        attributes.put("bases inserted per thousand", tokens[4]);
        attributes.put("repeat class", tokens[11]);
        attributes.put("repeat family", tokens[12]);
        attributes.put("repeat start", tokens[13]);
        attributes.put("repeat end", tokens[14]);
        feature.setAttributes(attributes);
        return feature;
    }

    /**
     * Decodes a line in the shorter, BED-like column order:
     * chromosome (0), start (1), end (2), strand (3), name (4).
     * <p/>
     * Only the first two columns are required. When the end column is absent the end
     * defaults to {@code start + 1}.
     *
     * @param tokens the already-split columns of one line
     * @return the decoded feature, or null if fewer than two columns are present
     */
    public BasicFeature decodeLegacy(String[] tokens) {
        // The first 3 columns are non optional for BED. We will relax this
        // and only require 2.
        int tokenCount = tokens.length;
        if (tokenCount < 2) {
            return null;
        }

        String chr = resolveChromosome(tokens[0]);
        int start = Integer.parseInt(tokens[1]);
        int end = tokenCount > 2 ? Integer.parseInt(tokens[2]) : start + 1;
        BasicFeature feature = new BasicFeature(chr, start, end);

        // The remaining columns are optional.
        if (tokenCount > 3) {
            feature.setStrand(parseStrand(tokens[3]));
        }
        if (tokenCount > 4) {
            // Strip double quotes from the name.
            String name = tokens[4].replaceAll("\"", "");
            feature.setName(name);
            feature.setIdentifier(name);
        }
        return feature;
    }

    /** Returns true for lines that are empty after trimming or that start with "#". */
    private static boolean isBlankOrComment(String line) {
        return line.trim().isEmpty() || line.startsWith("#");
    }

    /** Returns the genome's alias for the chromosome name, or the name itself when no genome is set. */
    private String resolveChromosome(String name) {
        return genome == null ? name : genome.getChromosomeAlias(name);
    }

    /** Maps a strand column to a {@link Strand}: leading '-' is negative, '+' is positive, anything else none. */
    private static Strand parseStrand(String token) {
        String trimmed = token.trim();
        char symbol = trimmed.isEmpty() ? ' ' : trimmed.charAt(0);
        switch (symbol) {
            case '-':
                return Strand.NEGATIVE;
            case '+':
                return Strand.POSITIVE;
            default:
                return Strand.NONE;
        }
    }
}