/*
* Copyright (c) 2007-2012 The Broad Institute, Inc.
* SOFTWARE COPYRIGHT NOTICE
* This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
* Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*/
package org.broad.igv.data;
//~--- non-JDK imports --------------------------------------------------------
import htsjdk.samtools.seekablestream.SeekableStream;
import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.exceptions.ParserException;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.track.TrackType;
import org.broad.igv.track.WindowFunction;
import org.broad.igv.ui.IGV;
import org.broad.igv.ui.util.MessageUtils;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import org.broad.igv.util.collections.FloatArrayList;
import org.broad.igv.util.collections.IntArrayList;
import org.broad.igv.util.stream.IGVSeekableStreamFactory;
import htsjdk.tribble.readers.AsciiLineReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Class description
*
* @author Enter your name here...
* @version Enter version here..., 08/11/11
*/
public class IGVDatasetParser {
private static Logger log = Logger.getLogger(IGVDatasetParser.class);
private ResourceLocator dataResourceLocator;
private int chrColumn = -1;
private int startColumn = -1;
private int endColumn = -1;
private int firstDataColumn = -1;
private int lastDataColumn = -1;
private int probeColumn = -1;
private boolean hasEndLocations = false;
private boolean hasCalls = false;
private Genome genome;
private IGV igv;
private int startBase = 0;
public IGVDatasetParser(ResourceLocator copyNoFile, Genome genome) {
this.dataResourceLocator = copyNoFile;
this.genome = genome;
this.igv = IGV.hasInstance() ? IGV.getInstance() : null;
}
private void setColumnDefaults() {
String tmp = (dataResourceLocator.getPath().endsWith(".txt")
? dataResourceLocator.getPath().substring(0,
dataResourceLocator.getPath().length() - 4) : dataResourceLocator.getPath()).toLowerCase();
if (tmp.endsWith(".igv")) {
chrColumn = 0;
startColumn = 1;
endColumn = 2;
probeColumn = 3;
firstDataColumn = 4;
hasEndLocations = true;
hasCalls = false;
} else if (tmp.endsWith(".xcn") || tmp.endsWith("cn") || tmp.endsWith(".snp") || tmp.endsWith(".loh")) {
probeColumn = 0;
chrColumn = 1;
startColumn = 2;
endColumn = -1;
firstDataColumn = 3;
hasEndLocations = false;
hasCalls = tmp.endsWith(".xcn") || tmp.endsWith(".snp");
} else if (tmp.endsWith(".expr")) {
//gene_id bundle_id chr left right FPKM FPKM_conf_lo FPKM_conf_hi
probeColumn = 0;
chrColumn = 2;
startColumn = 3;
endColumn = 4;
startBase = 1;
firstDataColumn = 5;
lastDataColumn = 5;
hasEndLocations = true;
} else {
// TODO -- popup dialog and ask user to define columns, and csv vs tsv?
throw new ParserException("Unknown file type: ", 0);
}
}
/*
*/
public static boolean parsableMAGE_TAB(ResourceLocator file) throws IOException {
AsciiLineReader reader = null;
try {
reader = ParsingUtils.openAsciiReader(file);
String nextLine = null;
//skip first row
reader.readLine();
//check second row for MAGE_TAB identifiers
if ((nextLine = reader.readLine()) != null && (nextLine.contains("Reporter REF") || nextLine.contains("Composite Element REF") || nextLine.contains("Term Source REF") || nextLine.contains("CompositeElement REF") || nextLine.contains("TermSource REF") || nextLine.contains("Coordinates REF"))) {
int count = 0;
// check if this mage_tab data matrix can be parsed by this class
while ((nextLine = reader.readLine()) != null && count < 5) {
nextLine = nextLine.trim();
if (nextLine.startsWith("SNP_A") || nextLine.startsWith("CN_")) {
return true;
}
count++;
}
return false;
}
} finally {
if (reader != null) {
reader.close();
}
}
return false;
}
/**
* Scan the datafile for chromosome breaks.
*
* @param dataset
* @return
*/
public List<ChromosomeSummary> scan(IGVDataset dataset) {
int estLineCount = ParsingUtils.estimateLineCount(dataResourceLocator.getPath());
Map<String, Integer> longestFeatureMap = new HashMap();
float dataMin = 0;
float dataMax = 0;
InputStream is = null;
AsciiLineReader reader = null;
String nextLine = null;
ChromosomeSummary chrSummary = null;
List<ChromosomeSummary> chrSummaries = new ArrayList();
String[] headings = null;
WholeGenomeData wgData = null;
int nRows = 0;
int headerRows = 0;
int count = 0;
boolean logNormalized;
try {
int skipColumns = hasCalls ? 2 : 1;
// BufferedReader reader = ParsingUtils.openBufferedReader(dataResourceLocator);
is = ParsingUtils.openInputStreamGZ(dataResourceLocator);
reader = new AsciiLineReader(is);
// Infer datatype from extension. This can be overriden in the
// comment section
if (isCopyNumberFileExt(dataResourceLocator.getPath())) {
dataset.setTrackType(TrackType.COPY_NUMBER);
dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean);
} else if (isLOHFileExt(dataResourceLocator.getPath())) {
dataset.setTrackType(TrackType.LOH);
dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean);
} else {
dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean);
}
// Parse comments and directives, if any
nextLine = reader.readLine();
while (nextLine.startsWith("#") || (nextLine.trim().length() == 0)) {
headerRows++;
if (nextLine.length() > 0) {
parseDirective(nextLine, dataset);
}
nextLine = reader.readLine();
}
if (chrColumn < 0) {
setColumnDefaults();
}
// Parse column headings
String[] data = nextLine.trim().split("\t");
// Set last data column
if (lastDataColumn < 0) {
lastDataColumn = data.length - 1;
}
headings = getHeadings(data, skipColumns);
dataset.setDataHeadings(headings);
// Infer if the data is logNormalized by looking for negative data values.
// Assume it is not until proven otherwise
logNormalized = false;
wgData = new WholeGenomeData(headings);
int chrRowCount = 0;
// Update
int updateCount = 5000;
long lastPosition = 0;
while ((nextLine = reader.readLine()) != null) {
if (igv != null && ++count % updateCount == 0) {
igv.setStatusBarMessage("Loaded: " + count + " / " + estLineCount + " (est)");
}
// Distance since last sample
String[] tokens = Globals.tabPattern.split(nextLine, -1);
int nTokens = tokens.length;
if (nTokens > 0) {
String thisChr = genome.getChromosomeAlias(tokens[chrColumn]);
if (chrSummary == null || !thisChr.equals(chrSummary.getName())) {
// Update whole genome and previous chromosome summary, unless this is
// the first chromosome
if (chrSummary != null) {
updateWholeGenome(chrSummary.getName(), dataset, headings, wgData);
chrSummary.setNDataPoints(nRows);
}
// Shart the next chromosome
chrSummary = new ChromosomeSummary(thisChr, lastPosition);
chrSummaries.add(chrSummary);
nRows = 0;
wgData = new WholeGenomeData(headings);
chrRowCount = 0;
}
lastPosition = reader.getPosition();
int location = -1;
try {
location = ParsingUtils.parseInt(tokens[startColumn]) - startBase;
} catch (NumberFormatException numberFormatException) {
log.error("Column " + tokens[startColumn] + " is not a number");
throw new ParserException("Column " + (startColumn + 1) +
" must contain an integer value." + " Found: " + tokens[startColumn],
count + headerRows, nextLine);
}
int length = 1;
if (hasEndLocations) {
try {
length = ParsingUtils.parseInt(tokens[endColumn].trim()) - location + 1;
} catch (NumberFormatException numberFormatException) {
log.error("Column " + tokens[endColumn] + " is not a number");
throw new ParserException("Column " + (endColumn + 1) +
" must contain an integer value." + " Found: " + tokens[endColumn],
count + headerRows, nextLine);
}
}
updateLongestFeature(longestFeatureMap, thisChr, length);
if (wgData.locations.size() > 0 && wgData.locations.get(wgData.locations.size() - 1) > location) {
throw new ParserException("File is not sorted, .igv and .cn files must be sorted by start position." +
" Use igvtools (File > Run igvtools..) to sort the file.", count + headerRows);
}
wgData.locations.add(location);
for (int idx = 0; idx < headings.length; idx++) {
int i = firstDataColumn + idx * skipColumns;
float copyNo = i < tokens.length ? readFloat(tokens[i]) : Float.NaN;
if (!Float.isNaN(copyNo)) {
dataMin = Math.min(dataMin, copyNo);
dataMax = Math.max(dataMax, copyNo);
}
if (copyNo < 0) {
logNormalized = true;
}
String heading = headings[idx];
wgData.data.get(heading).add(copyNo);
}
nRows++;
}
chrRowCount++;
}
dataset.setLongestFeatureMap(longestFeatureMap);
} catch (ParserException pe) {
throw pe;
} catch (FileNotFoundException e) {
// DialogUtils.showError("SNP file not found: " + dataSource.getCopyNoFile());
log.error("File not found: " + dataResourceLocator);
throw new RuntimeException(e);
} catch (Exception e) {
log.error("Exception when loading: " + dataResourceLocator.getPath(), e);
if (nextLine != null && (count + headerRows != 0)) {
throw new ParserException(e.getMessage(), e, count + headerRows, nextLine);
} else {
throw new RuntimeException(e);
}
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
log.error("Error closing IGVDataset stream", e);
}
}
}
// Update last chromosome
if (chrSummary != null) {
updateWholeGenome(chrSummary.getName(), dataset, headings, wgData);
chrSummary.setNDataPoints(nRows);
}
dataset.setLogNormalized(logNormalized);
dataset.setDataMin(dataMin);
dataset.setDataMax(dataMax);
return chrSummaries;
}
private void updateLongestFeature(Map<String, Integer> longestFeatureMap, String thisChr, int length) {
if (longestFeatureMap.containsKey(thisChr)) {
longestFeatureMap.put(thisChr, Math.max(longestFeatureMap.get(thisChr), length));
} else {
longestFeatureMap.put(thisChr, length);
}
}
private float readFloat(String token) {
float copyNo = Float.NaN;
try {
if (token != null) {
copyNo = Float.parseFloat(token);
}
} catch (NumberFormatException e) {
// This is an expected condition.
}
return copyNo;
}
/**
* Load data for a single chromosome.
*
* @param chrSummary
* @param dataHeaders
* @return
*/
public ChromosomeData loadChromosomeData(ChromosomeSummary chrSummary, String[] dataHeaders) {
// InputStream is = null;
try {
int skipColumns = hasCalls ? 2 : 1;
// Get an estimate of the number of snps (rows). THIS IS ONLY AN ESTIMATE
int nRowsEst = chrSummary.getNDataPts();
SeekableStream is = IGVSeekableStreamFactory.getInstance().getStreamFor(dataResourceLocator.getPath());
is.seek(chrSummary.getStartPosition());
AsciiLineReader reader = new AsciiLineReader(is);
// Create containers to hold data
IntArrayList startLocations = new IntArrayList(nRowsEst);
IntArrayList endLocations = (hasEndLocations ? new IntArrayList(nRowsEst) : null);
List<String> probes = new ArrayList(nRowsEst);
Map<String, FloatArrayList> dataMap = new HashMap();
for (String h : dataHeaders) {
dataMap.put(h, new FloatArrayList(nRowsEst));
}
// Begin loop through rows
String chromosome = chrSummary.getName();
boolean chromosomeStarted = false;
String nextLine = reader.readLine();
while ((nextLine != null) && (nextLine.trim().length() > 0)) {
if (!nextLine.startsWith("#")) {
try {
String[] tokens = Globals.tabPattern.split(nextLine, -1);
String thisChromosome = genome.getChromosomeAlias(tokens[chrColumn].trim());
if (thisChromosome.equals(chromosome)) {
chromosomeStarted = true;
// chromosomeData.setMarkerId(nRows, tokens[0]);
// The probe. A new string is created to prevent holding on to the entire row through a substring reference
String probe = new String(tokens[probeColumn]);
probes.add(probe);
int start = ParsingUtils.parseInt(tokens[startColumn].trim()) - startBase;
if (hasEndLocations) {
endLocations.add(ParsingUtils.parseInt(tokens[endColumn].trim()));
}
startLocations.add(start);
if(tokens.length <= firstDataColumn + (dataHeaders.length - 1)*skipColumns){
String msg = "Line has too few data columns: " + nextLine;
log.error(msg);
throw new RuntimeException(msg);
}
for (int idx = 0; idx < dataHeaders.length; idx++) {
int i = firstDataColumn + idx * skipColumns;
float copyNo = i <= lastDataColumn ? readFloat(tokens[i]) : Float.NaN;
String heading = dataHeaders[idx];
dataMap.get(heading).add(copyNo);
}
} else if (chromosomeStarted) {
break;
}
} catch (NumberFormatException numberFormatException) {
// Skip line
log.info("Skipping line (NumberFormatException) " + nextLine);
}
}
nextLine = reader.readLine();
}
// Loop complete
ChromosomeData cd = new ChromosomeData(chrSummary.getName());
cd.setProbes(probes.toArray(new String[]{}));
cd.setStartLocations(startLocations.toArray());
if (hasEndLocations) {
cd.setEndLocations(endLocations.toArray());
}
for (String h : dataHeaders) {
cd.setData(h, dataMap.get(h).toArray());
}
return cd;
} catch (IOException ex) {
log.error("Error parsing cn file", ex);
throw new RuntimeException("Error parsing cn file", ex);
}
}
/**
* Note: This is an exact copy of the method in ExpressionFileParser. Refactor to merge these
* two parsers, or share a common base class.
*
* @param comment
* @param dataset
*/
private void parseDirective(String comment, IGVDataset dataset) {
String tmp = comment.substring(1, comment.length());
if (tmp.startsWith("track")) {
ParsingUtils.parseTrackLine(tmp, dataset.getTrackProperties());
} else if (tmp.startsWith("columns")) {
parseColumnLine(tmp);
} else {
String[] tokens = tmp.split("=");
if (tokens.length != 2) {
return;
}
String key = tokens[0].trim().toLowerCase();
if (key.equals("name")) {
dataset.setName(tokens[1].trim());
} else if (key.equals("type")) {
try {
dataset.setTrackType(TrackType.valueOf(tokens[1].trim().toUpperCase()));
} catch (Exception exception) {
// Ignore
}
} else if (key.equals("coords")) {
startBase = Integer.parseInt(tokens[1].trim());
}
}
}
private boolean isCopyNumberFileExt(String filename) {
String tmp = ((filename.endsWith(".txt") || filename.endsWith(".tab") || filename.endsWith(".xls")
? filename.substring(0, filename.length() - 4) : filename)).toLowerCase();
return tmp.endsWith(".cn") || tmp.endsWith(".xcn") || tmp.endsWith(".snp");
}
private boolean isLOHFileExt(String filename) {
String tmp = (filename.endsWith(".txt") || filename.endsWith(".tab") || filename.endsWith(".xls")
? filename.substring(0, filename.length() - 4) : filename);
return tmp.endsWith(".loh");
}
/**
* Return the sample headings for the copy number file.
*
* @param tokens
* @param skipColumns
* @return
*/
public String[] getHeadings(String[] tokens, int skipColumns) {
return getHeadings(tokens, skipColumns, false);
}
/**
* Return the sample headings for the copy number file.
*
* @param tokens
* @param skipColumns
* @param removeDuplicates , whether to remove any duplicate headings
* @return
*/
public String[] getHeadings(String[] tokens, int skipColumns, boolean removeDuplicates) {
ArrayList headings = new ArrayList();
String previousHeading = null;
for (int i = firstDataColumn; i <= lastDataColumn; i += skipColumns) {
if (removeDuplicates) {
if (previousHeading != null && tokens[i].equals(previousHeading) || tokens[i].equals("")) {
continue;
}
previousHeading = tokens[i];
}
headings.add(tokens[i].trim());
}
return (String[]) headings.toArray(new String[0]);
}
private void parseColumnLine(String tmp) {
String[] tokens = tmp.split("\\s+");
String neg_colnum = "Error parsing column line: " + tmp + "<br>Column numbers must be > 0";
if (tokens.length > 1) {
for (int i = 1; i < tokens.length; i++) {
String[] kv = tokens[i].split("=");
if (kv.length == 2) {
if (kv[0].toLowerCase().equals("chr")) {
int c = Integer.parseInt(kv[1]);
if (c < 1) {
MessageUtils.showMessage(neg_colnum);
} else {
chrColumn = c - 1;
}
} else if (kv[0].toLowerCase().equals("start")) {
int c = Integer.parseInt(kv[1]);
if (c < 1) {
MessageUtils.showMessage(neg_colnum);
} else {
startColumn = c - 1;
}
} else if (kv[0].toLowerCase().equals("end")) {
int c = Integer.parseInt(kv[1]);
if (c < 1) {
MessageUtils.showMessage(neg_colnum);
} else {
endColumn = c - 1;
hasEndLocations = true;
}
} else if (kv[0].toLowerCase().equals("probe")) {
int c = Integer.parseInt(kv[1]);
if (c < 1) {
MessageUtils.showMessage(neg_colnum);
} else {
probeColumn = c - 1;
}
} else if (kv[0].toLowerCase().equals("data")) {
// examples 4, 4-8, 4-
String[] se = kv[1].split("-");
int c = Integer.parseInt(se[0]);
if (c < 1) {
MessageUtils.showMessage(neg_colnum);
} else {
this.firstDataColumn = c - 1;
}
if (se.length > 1) {
c = Integer.parseInt(se[1]);
if (c < 1) {
MessageUtils.showMessage(neg_colnum);
} else {
this.lastDataColumn = c - 1;
}
}
}
}
}
}
}
private void updateWholeGenome(String currentChromosome, IGVDataset dataset, String[] headings,
IGVDatasetParser.WholeGenomeData wgData) {
if (!genome.getHomeChromosome().equals(Globals.CHR_ALL)) {
return;
}
// Update whole genome data
int[] locations = wgData.locations.toArray();
if (locations.length > 0) {
Map<String, float[]> tmp = new HashMap(wgData.data.size());
for (String s : wgData.headings) {
tmp.put(s, wgData.data.get(s).toArray());
}
GenomeSummaryData genomeSummary = dataset.getGenomeSummary();
if (genomeSummary == null) {
genomeSummary = new GenomeSummaryData(genome, headings);
dataset.setGenomeSummary(genomeSummary);
}
genomeSummary.addData(currentChromosome, locations, tmp);
}
}
class WholeGenomeData {
String[] headings;
IntArrayList locations = new IntArrayList(50000);
Map<String, FloatArrayList> data = new HashMap();
WholeGenomeData(String[] headings) {
this.headings = headings;
for (String h : headings) {
data.put(h, new FloatArrayList(50000));
}
}
int size() {
return locations.size();
}
}
}