package gem;
import gem.parser.TabDelimitedFileParser;
import gem.util.FileUtil;
import gem.util.Summary;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
/**
* @author Ozgun Babur
*/
public class CrossPlatformMapper implements Constants
{
/**
 * Reads the whole expression file and builds a {@link Gene} for every requested identifier
 * that has a row in it.
 *
 * Each identifier is looked up as a gene ID; when it is also a key of the symbol-to-gene map
 * it is treated as a symbol and translated to its gene ID first. A row is accepted when the
 * file is keyed by either the gene ID or the symbol, the gene ID row being preferred.
 *
 * @param ids gene IDs and/or symbols to look up
 * @param filename expression file, read through {@code readCrossPlatExp} without an ID filter
 * @return map from gene ID to the gene carrying the expression row found for it
 * @throws Throwable whatever reading or parsing the file throws
 */
public static Map<String, Gene> fetchGenes(Collection<String> ids, String filename) throws Throwable
{
	Map<String, double[]> expressions = readCrossPlatExp(filename, null);
	Map<String, List<Gene>> candidates = new HashMap<String, List<Gene>>();

	for (String given : ids)
	{
		String geneID = given;
		String symbol = Triplet.getGeneToSymbolMap().get(given);

		// The caller may have handed us a symbol instead of a gene ID.
		if (Triplet.getSymbolToGeneMap().containsKey(given))
		{
			symbol = given;
			geneID = Triplet.getSymbolToGeneMap().get(given);
		}

		boolean present = expressions.containsKey(geneID) ||
			(symbol != null && expressions.containsKey(symbol));
		if (!present) continue;

		if (!candidates.containsKey(geneID))
		{
			candidates.put(geneID, new ArrayList<Gene>());
		}

		// Prefer the row keyed by gene ID; otherwise use the row keyed by symbol.
		double[] row = expressions.get(geneID);
		if (row == null) row = expressions.get(symbol);

		String label = (symbol == null) ? geneID : symbol;
		Gene gene = new Gene(geneID, label, row.length);
		gene.value = row;
		candidates.get(geneID).add(gene);
	}

	Map<String, Gene> result = new HashMap<String, Gene>();
	for (List<Gene> bucket : candidates.values())
	{
		// Earlier revisions picked the most varying gene or the average of the bucket here.
		// With assertions enabled a bucket must hold exactly one gene, so getMax has no real
		// choice to make.
		assert bucket.size() == 1;
		Gene picked = getMax(bucket);
		result.put(picked.geneid, picked);
	}
	return result;
}
/**
 * Platform-aware variant: maps the requested identifiers to rows of the data file through
 * the platform file, keeps one row per identifier via {@code selectMostVaried}, and wraps
 * each kept row in a {@link Gene}.
 *
 * @param ids identifiers to fetch
 * @param filename expression data file
 * @param platform platform annotation file
 * @return map from identifier to its gene
 * @throws Throwable whatever {@code readCross} throws
 */
public static Map<String, Gene> fetchGenes(Set<String> ids, String filename, String platform) throws Throwable
{
	return convertValuesToGenes(selectMostVaried(readCross(platform, filename, ids)));
}
/**
 * Averages several genes position by position.
 *
 * A single-element list is returned as is (the very same object). Otherwise a new gene is
 * created with the first gene's ID, the symbol {@code ".|."} and the first gene's length.
 *
 * @param genes genes to average; must not be empty
 * @return the only gene of the list, or a new gene holding the per-position means
 */
private static Gene getAverage(List<Gene> genes)
{
	assert !genes.isEmpty();

	Gene head = genes.get(0);
	if (genes.size() == 1) return head;

	Gene mean = new Gene(head.geneid, ".|.", head.value.length);
	int count = genes.size();

	for (int pos = 0; pos < mean.value.length; pos++)
	{
		double total = 0;
		for (Gene member : genes)
		{
			total += member.value[pos];
		}
		mean.value[pos] = total / count;
	}
	return mean;
}
/**
 * Picks the gene whose values add up to the largest sum. On ties the earliest gene in the
 * list wins.
 *
 * A sum that is NaN (any NaN among the values) never compares greater than another sum, so
 * such genes are only returned as a last resort: when no gene has a comparable sum, the
 * first gene of the list is returned instead of {@code null}, so callers can dereference
 * the result for any non-empty list.
 *
 * @param genes candidates; must not be empty
 * @return the gene with the largest sum; {@code null} only for an empty list
 */
private static Gene getMax(List<Gene> genes)
{
	assert !genes.isEmpty();
	if (genes.size() == 1) return genes.get(0);

	Gene best = null;
	double bestSum = Double.NEGATIVE_INFINITY;

	for (Gene gene : genes)
	{
		double sum = 0;
		for (int i = 0; i < gene.value.length; i++)
		{
			sum += gene.value[i];
		}

		// False for NaN sums, so those never displace a real candidate.
		if (sum > bestSum)
		{
			bestSum = sum;
			best = gene;
		}
	}

	// Every sum was NaN or negative infinity: fall back to the first gene, not null.
	if (best == null && !genes.isEmpty()) best = genes.get(0);
	return best;
}
/**
 * Parses a tab-delimited expression file into one value row per row ID.
 *
 * Lines are skipped when they are empty or start with {@code #}, {@code !}, {@code ^},
 * {@code "G}, {@code "A}, {@code ID}, or with {@code "I} unless it is {@code "ILMN}.
 * The first column is the row ID (surrounding quotes removed); the remaining columns are
 * parsed as doubles, with {@code null} and empty tokens becoming {@code NaN}. Trailing empty
 * columns are not padded, so such rows yield shorter arrays. When an ID occurs on several
 * lines, {@code chooseOne} decides which row is kept.
 *
 * The reader is closed even when reading or parsing fails.
 *
 * @param filename file to read
 * @param ids row IDs to keep, or {@code null} to keep every row
 * @return map from row ID to the row chosen for it
 * @throws Throwable on I/O failure or when a line cannot be parsed
 */
public static Map<String, double[]> readCrossPlatExp(String filename, Set<String> ids) throws Throwable
{
	// Optimized for niki's data. Should be updated for other formats.
	Map<String, List<double[]>> rowsByID = new HashMap<String, List<double[]>>();

	BufferedReader reader = new BufferedReader(new FileReader(filename));
	try
	{
		for (String line = reader.readLine(); line != null; line = reader.readLine())
		{
			// Comments, metadata, blank lines and header rows carry no values.
			if (line.startsWith("#") || line.startsWith("!") || line.startsWith("^") ||
				line.length() == 0 || line.startsWith("\"G") ||
				(line.startsWith("\"I") && !line.startsWith("\"ILMN")) ||
				line.startsWith("\"A") || line.startsWith("ID")) continue;

			String id = line.substring(0, line.indexOf("\t"));
			if (id.startsWith("\"")) id = id.substring(1, id.lastIndexOf("\""));

			if (ids != null && !ids.contains(id)) continue;

			// Cut the ID column off; a quoted ID ends at the first quote-tab pair.
			if (line.startsWith("\""))
			{
				line = line.substring(line.indexOf("\"\t") + 2);
			}
			else
			{
				line = line.substring(line.indexOf("\t") + 1);
			}

			if (line.startsWith("\"")) line = line.substring(1, line.lastIndexOf("\""));

			// Missing trailing values are not handled anymore (no "null" padding).
			String[] tokens = line.split("\t");
			double[] val = new double[tokens.length];

			for (int i = 0; i < tokens.length; i++)
			{
				if (tokens[i].equals("null") || tokens[i].equals(""))
				{
					val[i] = Double.NaN;
				}
				else
				{
					val[i] = Double.parseDouble(tokens[i]);
				}
			}

			if (!rowsByID.containsKey(id)) rowsByID.put(id, new ArrayList<double[]>());
			rowsByID.get(id).add(val);
		}
	}
	finally
	{
		// Release the file handle on the failure paths as well.
		reader.close();
	}

	Map<String, double[]> result = new HashMap<String, double[]>();
	for (Map.Entry<String, List<double[]>> entry : rowsByID.entrySet())
	{
		result.put(entry.getKey(), chooseOne(entry.getValue()));
	}
	return result;
}
/**
 * Returns the row with the largest variance as reported by {@code Summary.variance}; on ties
 * the earliest row wins.
 *
 * NOTE(review): a row is only taken when its variance compares greater than -1e10, so the
 * result is {@code null} when no row qualifies (an empty list, or presumably rows whose
 * variance is NaN - confirm against Summary.variance).
 *
 * @param list candidate rows
 * @return the chosen row, or {@code null} if none qualified
 */
private static double[] chooseOne(List<double[]> list)
{
	double[] winner = null;
	double highest = -1e10;

	Iterator<double[]> rows = list.iterator();
	while (rows.hasNext())
	{
		double[] row = rows.next();
		double variance = Summary.variance(row);
		if (variance > highest)
		{
			winner = row;
			highest = variance;
		}
	}
	return winner;
}
/**
 * Attaches expression data to the triplets using a platform annotation file, and removes
 * the triplets for which modulator, factor or target has no data.
 *
 * If gene "367" is not among the fetched genes, a gene with ID "367" and symbol "AR" is
 * added, sized like an arbitrary fetched row. That sizing reads the first fetched row, so
 * the call fails with {@code NoSuchElementException} when no row was fetched at all.
 * The {@code genesMap} cache is reset before the triplets are associated.
 *
 * @param trips triplets to fill in; pruned in place
 * @param filename expression data file
 * @param platform platform annotation file
 * @throws Throwable whatever reading the files throws
 */
public static void associateAndClean(List<Triplet> trips, String filename, String platform) throws Throwable
{
	Map<String, double[]> id2val = selectMostVaried(readCross(platform, filename, getIDs(trips)));
	Map<String, Gene> id2gene = convertValuesToGenes(id2val);

	final String arID = "367";
	if (!id2gene.containsKey(arID))
	{
		// Only the length of some fetched row is used to size the stand-in gene.
		int length = id2val.values().iterator().next().length;
		id2gene.put(arID, new Gene(arID, "AR", length));
	}

	genesMap = new HashMap<String, Gene>();
	associate(trips, id2gene);
}
/**
 * Reduces every list of rows to its first row.
 *
 * @param id2vals rows grouped by identifier; every list must be non-empty
 * @return map from identifier to the first row of its list
 */
private static Map<String, double[]> selectFirst(Map<String, List<double[]>> id2vals)
{
	Map<String, double[]> firsts = new HashMap<String, double[]>();
	for (Map.Entry<String, List<double[]>> entry : id2vals.entrySet())
	{
		double[] head = entry.getValue().iterator().next();
		firsts.put(entry.getKey(), head);
	}
	return firsts;
}
/**
 * Keeps, for every identifier, the row with the largest variance as reported by
 * {@code Summary.variance}.
 *
 * Only a variance that compares greater than zero qualifies, so an identifier whose rows
 * all fail that test is left out of the result.
 *
 * @param id2vals rows grouped by identifier
 * @return map from identifier to its most varying row
 */
private static Map<String, double[]> selectMostVaried(Map<String, List<double[]>> id2vals)
{
	Map<String, double[]> selected = new HashMap<String, double[]>();

	for (Map.Entry<String, List<double[]>> entry : id2vals.entrySet())
	{
		double[] best = null;
		double bestVariance = 0;

		for (double[] candidate : entry.getValue())
		{
			double variance = Summary.variance(candidate);
			if (variance > bestVariance)
			{
				best = candidate;
				bestVariance = variance;
			}
		}

		if (best == null) continue;
		selected.put(entry.getKey(), best);
	}
	return selected;
}
/**
 * Attaches expression data read directly from the expression file to the triplets, and
 * removes the triplets for which modulator, factor or target has no data. The
 * {@code genesMap} cache is reset before the triplets are associated.
 *
 * @param trips triplets to fill in; pruned in place
 * @param filename expression data file
 * @throws Throwable whatever reading the file throws
 */
public static void associateAndClean(List<Triplet> trips, String filename) throws Throwable
{
	Map<String, Gene> fetched = fetchGenes(getIDs(trips), filename);
	genesMap = new HashMap<String, Gene>();
	associate(trips, fetched);
}
/**
 * Sets the M, F and T genes of every triplet from the map, using the triplet's modulator,
 * factor and target as keys, and removes each triplet for which any of the three lookups
 * found nothing. The looked-up genes are shared between triplets, not copied.
 *
 * @param trips triplets to fill in; pruned in place
 * @param map genes by identifier
 */
private static void associate(List<Triplet> trips, Map<String, Gene> map)
{
	for (Iterator<Triplet> cursor = trips.iterator(); cursor.hasNext();)
	{
		Triplet triplet = cursor.next();

		triplet.M = map.get(triplet.modulator);
		triplet.F = map.get(triplet.factor);
		triplet.T = map.get(triplet.target);

		boolean complete = triplet.M != null && triplet.F != null && triplet.T != null;
		if (!complete)
		{
			cursor.remove();
		}
	}
}
/**
 * Collects the modulator, factor and target identifiers of all triplets.
 * {@code backFromURLToIDs()} is invoked on every triplet before its identifiers are read.
 *
 * @param trips triplets to scan
 * @return the distinct identifiers found
 */
private static Set<String> getIDs(List<Triplet> trips)
{
	Set<String> collected = new HashSet<String>();
	for (Triplet triplet : trips)
	{
		triplet.backFromURLToIDs();
		Collections.addAll(collected, triplet.modulator, triplet.factor, triplet.target);
	}
	return collected;
}
/**
 * Cache of the copies created by {@code copy}, keyed by the id passed to it. It is replaced
 * with a fresh map by both {@code associateAndClean} methods and stays {@code null} until
 * one of them has run.
 */
static Map<String, Gene> genesMap;
/**
 * Returns the copy cached in {@code genesMap} under the given id, creating it first when
 * there is none. A new copy keeps the source gene's ID, takes {@code id} as its second
 * constructor argument and receives a copy of the source values.
 *
 * @param g gene to copy; may be {@code null}
 * @param id key of the copy in the cache
 * @return the cached or newly created copy, or {@code null} when {@code g} is {@code null}
 */
static Gene copy(Gene g, String id)
{
	if (g == null) return null;

	if (!genesMap.containsKey(id))
	{
		int length = g.value.length;
		Gene duplicate = new Gene(g.geneid, id, length);
		System.arraycopy(g.value, 0, duplicate.value, 0, length);
		genesMap.put(id, duplicate);
		return duplicate;
	}
	return genesMap.get(id);
}
/**
 * Fetches the expression rows of the requested genes by going through a platform file that
 * links gene identifiers to the row IDs of the data file.
 *
 * The gene column of the platform file is the first of ENTREZ_GENE_ID, Entrez_Gene_ID,
 * GENE_SYMBOL and GENE that {@code FileUtil.columnsLineContains} reports. A requested
 * identifier is looked up by its symbol when the gene-to-symbol map knows it and the
 * platform file lists that symbol; otherwise it is looked up as given.
 *
 * NOTE(review): when none of the four columns is found, {@code null} is handed to the
 * parser as the column name - confirm how the parser reacts.
 *
 * @param platfile platform annotation file
 * @param datafile expression data file
 * @param egIDs requested gene identifiers
 * @return for every requested identifier with data, the rows of all its row IDs
 * @throws Throwable whatever reading the files throws
 */
public static Map<String, List<double[]>> readCross(String platfile, String datafile,
	Set<String> egIDs) throws Throwable
{
	TabDelimitedFileParser parser = new TabDelimitedFileParser(platfile);

	// The first candidate header found in the platform file wins.
	String[] candidates = {"ENTREZ_GENE_ID", "Entrez_Gene_ID", "GENE_SYMBOL", "GENE"};
	String egCol = null;
	for (int i = 0; i < candidates.length && egCol == null; i++)
	{
		if (FileUtil.columnsLineContains(platfile, candidates[i]))
		{
			egCol = candidates[i];
		}
	}

	Map<String, Set<String>> eg2id = parser.getOneToManyMap(egCol, "ID");
	Map<String, String> g2s = Triplet.getGeneToSymbolMap();

	// Pass 1: gather the row IDs worth reading from the data file.
	Set<String> wanted = new HashSet<String>();
	for (String eg : egIDs)
	{
		String key = eg;
		if (g2s.containsKey(eg))
		{
			String symbol = g2s.get(eg);
			if (eg2id.containsKey(symbol)) key = symbol;
		}
		if (eg2id.containsKey(key))
		{
			wanted.addAll(eg2id.get(key));
		}
	}

	Map<String, double[]> id2val = readCrossPlatExp(datafile, wanted);

	// Pass 2: group the rows that were read under the identifiers as requested.
	Map<String, List<double[]>> eg2vals = new HashMap<String, List<double[]>>();
	for (String eg : egIDs)
	{
		String key = eg;
		if (g2s.containsKey(eg))
		{
			String symbol = g2s.get(eg);
			if (eg2id.containsKey(symbol)) key = symbol;
		}
		if (!eg2id.containsKey(key)) continue;

		for (String rowID : eg2id.get(key))
		{
			if (!id2val.containsKey(rowID)) continue;

			if (!eg2vals.containsKey(eg))
			{
				eg2vals.put(eg, new ArrayList<double[]>());
			}
			eg2vals.get(eg).add(id2val.get(rowID));
		}
	}
	return eg2vals;
}
/**
 * Attaches GSE6919 expression data of the given platform to the triplets, and removes the
 * triplets for which modulator, factor or target has no data.
 *
 * The entry for gene "367" is always replaced by a gene with ID "367" and symbol "367|367",
 * sized like an arbitrary fetched row; that sizing fails with
 * {@code NoSuchElementException} when no row was fetched at all.
 *
 * @param trips triplets to fill in; pruned in place
 * @param platform platform name used to locate the GSE6919 files
 * @throws Throwable whatever reading the files throws
 */
public static void associateWithGSE6919(List<Triplet> trips, String platform) throws Throwable
{
	Map<String, double[]> id2val = readGSE6919_exps(platform, getIDs(trips));
	Map<String, Gene> id2gene = convertValuesToGenes(id2val);

	// Only the length of some fetched row is used to size the stand-in gene.
	int length = id2val.values().iterator().next().length;
	id2gene.put("367", new Gene("367", "367|367", length));

	associate(trips, id2gene);
}
/**
 * Wraps every row in a {@link Gene}. The symbol comes from the gene-to-symbol map and falls
 * back to the identifier itself; the row is copied into the gene's value array.
 *
 * @param id2val rows by identifier
 * @return genes by identifier
 */
private static Map<String, Gene> convertValuesToGenes(Map<String, double[]> id2val)
{
	Map<String, Gene> genes = new HashMap<String, Gene>();

	for (Map.Entry<String, double[]> entry : id2val.entrySet())
	{
		String id = entry.getKey();
		double[] row = entry.getValue();

		String symbol = Triplet.getGeneToSymbolMap().get(id);
		Gene gene = new Gene(id, symbol != null ? symbol : id, row.length);
		System.arraycopy(row, 0, gene.value, 0, row.length);

		genes.put(id, gene);
	}
	return genes;
}
/**
 * Reads the GSE6919 expression rows of the requested genes for one platform, replaces every
 * value by its natural logarithm, and keeps for each gene the row that
 * {@code selectMostChanged} picks between the normal and the cancer samples.
 *
 * The logarithm is applied in place. The same array can be listed under more than one
 * identifier by {@code readCross}, so every array is transformed only once, no matter how
 * often it is listed.
 *
 * @param platform platform name used to build the GSE6919 file names
 * @param ids requested gene identifiers
 * @return map from identifier to its selected, log-transformed row
 * @throws Throwable whatever reading the files throws
 */
public static Map<String, double[]> readGSE6919_exps(String platform, Set<String> ids) throws Throwable
{
	Map<String, List<double[]>> m = readCross("resource/expdata/GSE6919/" + platform + ".txt",
		"resource/expdata/GSE6919/GSE6919-" + platform + "_series_matrix.txt", ids);

	// Identity-based bookkeeping: an array reachable through several keys is logged once.
	Map<double[], Boolean> transformed = new IdentityHashMap<double[], Boolean>();

	for (List<double[]> rows : m.values())
	{
		for (double[] v : rows)
		{
			if (transformed.put(v, Boolean.TRUE) != null) continue;

			for (int i = 0; i < v.length; i++)
			{
				v[i] = Math.log(v[i]);
			}
		}
	}

	boolean[] posN = getPosInGSE6919(platform, true);
	boolean[] posC = getPosInGSE6919(platform, false);
	return selectMostChanged(m, posN, posC);
}
/**
 * Keeps, for every identifier, the row whose change between the two sample groups, as
 * computed by {@code CellTypeMatcher.getChangeBetweenTissues}, lies farthest from 0.5.
 * On ties the earliest row wins.
 *
 * With assertions enabled, an identifier without any qualifying row fails the assertion;
 * without them it is mapped to {@code null}.
 *
 * @param map rows grouped by identifier
 * @param posNormal flags of the normal samples
 * @param posCancer flags of the cancer samples
 * @return map from identifier to its selected row
 */
public static Map<String, double[]> selectMostChanged(Map<String, List<double[]>> map,
	boolean[] posNormal, boolean[] posCancer)
{
	Map<String, double[]> selected = new HashMap<String, double[]>();

	for (Map.Entry<String, List<double[]>> entry : map.entrySet())
	{
		double[] winner = null;
		double farthest = -1;

		for (double[] row : entry.getValue())
		{
			// A throw-away gene that merely carries the row into the matcher.
			Gene carrier = new Gene("", "", 0);
			carrier.value = row;

			double change = CellTypeMatcher.getChangeBetweenTissues(carrier, posNormal, posCancer);
			double distance = Math.abs(change - .5);
			if (distance > farthest)
			{
				winner = row;
				farthest = distance;
			}
		}

		assert winner != null;
		selected.put(entry.getKey(), winner);
	}
	return selected;
}
/**
 * Reads the sample names from the header row of the GSE6919 series matrix file of the given
 * platform.
 *
 * Empty lines and lines starting with {@code !} are skipped. The header row is the first
 * remaining line that starts with {@code "ID}; everything after its first column is split
 * on tabs and the first and last character of every token (the quotes) are dropped.
 * Reading stops at the header row, and the reader is closed even when parsing fails.
 *
 * @param platform platform name used to build the file name
 * @return the sample names in column order, or {@code null} if the file has no header row
 * @throws IOException if the file cannot be read
 */
public static List<String> getGSE6919_names(String platform) throws IOException
{
	List<String> names = null;
	BufferedReader reader = new BufferedReader(new FileReader("resource/expdata/GSE6919/GSE6919-" + platform + "_series_matrix.txt"));
	try
	{
		for (String line = reader.readLine(); line != null; line = reader.readLine())
		{
			if (line.startsWith("!") || line.length() == 0) continue;

			if (line.startsWith("\"ID"))
			{
				String[] tokens = line.substring(line.indexOf("\"\t") + 2).split("\t");
				names = new ArrayList<String>(tokens.length);
				for (String token : tokens)
				{
					// Strip the surrounding quotes.
					names.add(token.substring(1, token.length() - 1));
				}
				break;
			}
		}
	}
	finally
	{
		// Release the file handle on the failure paths as well.
		reader.close();
	}
	return names;
}
/**
 * Selects the GSE6919 experiments whose description contains both "Normal" and "free".
 *
 * @return IDs of the matching experiments
 */
public static Set<String> getGSE6919Normal()
{
	final Set<String> normals = getGSE6919Select("Normal", "free");
	return normals;
}
/**
 * Selects the GSE6919 experiments whose description contains "umor samples".
 *
 * @return IDs of the matching experiments
 */
public static Set<String> getGSE6919Cancer()
{
	final Set<String> tumors = getGSE6919Select("umor samples");
	return tumors;
}
/**
 * Selects the experiments listed in resource/expdata/GSE6919/exptypes.txt whose
 * "Description" contains every one of the given words (case-sensitive substring match).
 * Without any word, every experiment is selected.
 *
 * @param words substrings that must all occur in the description
 * @return values of the "ID" column of the matching experiments
 */
public static Set<String> getGSE6919Select(String... words)
{
	Set<String> selected = new HashSet<String>();
	TabDelimitedFileParser parser = new TabDelimitedFileParser(
		"resource/expdata/GSE6919/exptypes.txt");
	Map<String, String> descriptions = parser.getOneToOneMap("ID", "Description");

	for (Map.Entry<String, String> entry : descriptions.entrySet())
	{
		String desc = entry.getValue();

		// Walk the words until one is missing from the description.
		int matched = 0;
		while (matched < words.length && desc.contains(words[matched]))
		{
			matched++;
		}

		if (matched == words.length) selected.add(entry.getKey());
	}
	return selected;
}
/**
 * Flags, per sample column of the platform's series matrix, whether the sample belongs to
 * the requested group.
 *
 * @param platform platform name used to locate the series matrix
 * @param normal {@code true} to flag the normal samples, {@code false} for the cancer samples
 * @return one flag per sample name, in column order
 * @throws IOException if the series matrix cannot be read
 */
public static boolean[] getPosInGSE6919(String platform, boolean normal) throws IOException
{
	List<String> names = getGSE6919_names(platform);

	Set<String> group;
	if (normal)
	{
		group = getGSE6919Normal();
	}
	else
	{
		group = getGSE6919Cancer();
	}

	boolean[] flags = new boolean[names.size()];
	int column = 0;
	for (String name : names)
	{
		flags[column++] = group.contains(name);
	}
	return flags;
}
}