/*
* Copyright (c) 2007-2013 The Broad Institute, Inc.
* SOFTWARE COPYRIGHT NOTICE
* This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
* Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*/
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broad.igv.tools.parsers;
import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.track.TrackType;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import htsjdk.tribble.readers.AsciiLineReader;
import java.io.IOException;
import java.util.Set;
/**
* @author jrobinso
*/
public class CNParser extends AbstractParser {
static private Logger log = Logger.getLogger(CNParser.class);
private static int PROBE_COL = 0;
int skipColumns;
// State variables. This is a serial type parser, these variables are used to hold temporary
// state.
private String chr;
String lastChr = "";
int lastPosition = 0;
private double minValue;
private double maxValue;
ResourceLocator resourceLocator;
Set<String> unsortedChromosomes;
private int chrColumn;
private int startColumn;
private int endColumn;
private int probeColumn;
private int firstDataColumn;
private boolean hasEndLocations;
private FileType type;
Genome genome;
private boolean hasCalls;
enum FileType {
IGV, XCN, SNP, CN
}
public CNParser(String file, DataConsumer dataConsumer, Genome genome) {
this(new ResourceLocator(file), dataConsumer, genome);
}
/**
*
*/
public CNParser(ResourceLocator locator, DataConsumer dataConsumer, Genome genome) {
super(dataConsumer);
this.resourceLocator = locator;
this.genome = genome;
String tmp = locator.getPath().toLowerCase();
tmp = tmp.endsWith(".txt") ? tmp.substring(0, tmp.length() - 4) : tmp;
hasCalls = tmp.endsWith(".xcn") || tmp.endsWith(".snp");
if (tmp.endsWith(".igv")) {
chrColumn = 0;
startColumn = 1;
endColumn = 2;
probeColumn = 3;
firstDataColumn = 4;
hasEndLocations = true;
hasCalls = false;
type = FileType.IGV;
setTrackType(TrackType.OTHER);
} else {
probeColumn = 0;
chrColumn = 1;
startColumn = 2;
endColumn = -1;
firstDataColumn = 3;
hasEndLocations = false;
hasCalls = tmp.endsWith(".xcn") || tmp.endsWith(".snp");
if (tmp.endsWith(".cn")) {
type = FileType.CN;
} else if (tmp.endsWith(".xcn")) {
type = FileType.XCN;
} else {
type = FileType.SNP;
}
}
skipColumns = hasCalls ? 2 : 1;
}
public void parseHeader(String[] tokens) {
int nDataColumns = (tokens.length - firstDataColumn) / skipColumns;
String[] headings = new String[nDataColumns];
for (int i = firstDataColumn; i < tokens.length; i += skipColumns) {
int idx = (i - firstDataColumn) / skipColumns;
headings[idx] = tokens[i];
}
setHeadings(headings);
}
/**
* @return
*/
public void parse() throws IOException {
AsciiLineReader reader = null;
try {
lastPosition = 0;
reader = ParsingUtils.openAsciiReader(resourceLocator);
String nextLine = null;
// Infer datatype from extension. This can be overriden in the
// comment section
if (isCopyNumberFileExt(resourceLocator.getPath())) {
setTrackType(TrackType.COPY_NUMBER);
} else if (isLOHFileExt(resourceLocator.getPath())) {
setTrackType(TrackType.LOH);
}
// Parse comments, if any
nextLine = reader.readLine();
while (nextLine.startsWith("#") || (nextLine.trim().length() == 0)) {
if (nextLine.length() > 0) {
parseComment(nextLine);
}
nextLine = reader.readLine();
}
parseHeader(nextLine.trim().split("\t"));
setTrackParameters();
float[] dataArray = new float[getHeadings().length];
while ((nextLine = reader.readLine()) != null && (nextLine.trim().length() > 0)) {
String[] tokens = Globals.singleTabMultiSpacePattern.split(nextLine);
int nTokens = tokens.length;
if (nTokens == 0) {
continue;
}
try {
chr = (genome == null ? tokens[chrColumn] : genome.getChromosomeAlias(tokens[chrColumn]));
if (!chr.equals(lastChr)) {
newChromosome();
}
lastChr = chr;
int startPosition = ParsingUtils.parseInt(tokens[startColumn].trim());
if (startPosition < lastPosition) {
throw new UnsortedException("Error: unsorted file. .cn files must be sorted by genomic position.");
}
lastPosition = startPosition;
int endPosition = hasEndLocations ? ParsingUtils.parseInt(tokens[endColumn].trim()) : startPosition + 1;
// TODO -- compare nTokens with expected number
for (int i = firstDataColumn; i < nTokens; i += skipColumns) {
int idx = (i - firstDataColumn) / skipColumns;
try {
dataArray[idx] = Float.parseFloat(tokens[i].trim());
} catch (NumberFormatException numberFormatException) {
dataArray[idx] = Float.NaN;
}
}
String probe = tokens[probeColumn];
getDataConsumer().addData(chr, startPosition, endPosition, dataArray, probe);
} catch (NumberFormatException e) {
log.error("Error parsing number in: " + nextLine + "\n" + e.getMessage(), e);
}
}
parsingComplete();
} catch (Exception e) {
log.error(e.getMessage(), e);
} finally {
if (reader != null) {
reader.close();
}
}
}
/**
* Method description
*
* @return
*/
public double getMinValue() {
return minValue;
}
/**
* Method description
*
* @return
*/
public double getMaxValue() {
return maxValue;
}
private void newChromosome() {
//getDataConsumer().newChromosome(chr);
lastPosition = -1;
}
private void parsingComplete() {
getDataConsumer().parsingComplete();
}
private boolean isCopyNumberFileExt(String filename) {
String tmp = (filename.endsWith(".txt") || filename.endsWith(".tab") || filename.endsWith(".xls")
? filename.substring(0, filename.length() - 4) : filename);
return tmp.endsWith(".cn") || tmp.endsWith(".xcn") || tmp.endsWith(".snp");
}
private boolean isLOHFileExt(String filename) {
String tmp = (filename.endsWith(".txt") || filename.endsWith(".tab") || filename.endsWith(".xls")
? filename.substring(0, filename.length() - 4) : filename);
return tmp.endsWith(".loh");
}
}