package gem;
import gem.parser.TabDelimitedFileParser;
import gem.util.Summary;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
/**
* Assumes there is one series matrix file named data.txt, and one platform file named platform.txt.
*
* @author Ozgun Babur
*/
public class ExpDataReader implements Constants
{
    /**
     * Ad-hoc smoke test: reads the expression row of a single hard-coded gene id from a
     * hard-coded directory and prints how many genes survived the variance filters.
     *
     * @param args ignored
     * @throws IOException if the platform or data file cannot be read
     */
    public static void main(String[] args) throws IOException
    {
        Set<String> ids = new HashSet<String>();
        ids.add("367");
        Map<String, double[]> map = readSubset(ids, "resource/expdata/duo", 10, 0.25);
        System.out.println(map.size());
    }

    /**
     * Attaches expression data to the given triplets.
     *
     * The M, F and T fields of every triplet are (re)assigned, possibly to null. Only triplets
     * for which all three genes have expression data get their mod_id / fac_id / tar_id set and
     * are included in the returned list.
     *
     * @param trips triplets to associate with expression data
     * @param dir directory containing platform.txt and data.txt
     * @param minvar threshold passed to {@link #readSubset}
     * @param minvarlog threshold passed to {@link #readSubset}
     * @return the triplets whose modulator, factor and target all have expression data
     * @throws IOException if the platform or data file cannot be read
     */
    public static List<Triplet> associate(List<Triplet> trips, String dir, double minvar,
        double minvarlog) throws IOException
    {
        Set<String> egids = Triplet.getGeneIDs(trips);
        Map<String, double[]> eg2val = readSubset(egids, dir, minvar, minvarlog);

        // Cache so that triplets sharing a gene id also share the same Gene instance.
        Map<String, Gene> eg2gene = new HashMap<String, Gene>();

        List<Triplet> list = new ArrayList<Triplet>();
        for (Triplet t : trips)
        {
            t.M = getGene(t.modulator, eg2val, eg2gene);
            t.F = getGene(t.factor, eg2val, eg2gene);
            t.T = getGene(t.target, eg2val, eg2gene);

            if (t.M != null && t.F != null && t.T != null)
            {
                t.mod_id = t.M.id;
                t.fac_id = t.F.id;
                t.tar_id = t.T.id;
                list.add(t);
            }
        }
        return list;
    }

    /**
     * Attaches expression data to the given tuples.
     *
     * @param tuples tuples to associate with expression data
     * @param dir directory containing platform.txt and data.txt
     * @param minvar threshold passed to {@link #readSubset}
     * @param minvarlog threshold passed to {@link #readSubset}
     * @return the tuples for which both genes have expression data, with their genes set
     * @throws IOException if the platform or data file cannot be read
     */
    public static List<Tuple> associateTuples(List<Tuple> tuples, String dir, double minvar,
        double minvarlog) throws IOException
    {
        Set<String> egids = Tuple.getGeneIDs(tuples);
        Map<String, double[]> eg2val = readSubset(egids, dir, minvar, minvarlog);

        // Cache so that tuples sharing a gene id also share the same Gene instance.
        Map<String, Gene> eg2gene = new HashMap<String, Gene>();

        List<Tuple> list = new ArrayList<Tuple>();
        for (Tuple t : tuples)
        {
            Gene U = getGene(t.u_id, eg2val, eg2gene);
            Gene T = getGene(t.t_id, eg2val, eg2gene);

            if (U != null && T != null)
            {
                t.setGenes(U, T);
                list.add(t);
            }
        }
        return list;
    }

    /**
     * Returns the Gene for the given id, creating and caching it on first use.
     *
     * @param egid gene id to look up
     * @param eg2val expression rows keyed by gene id
     * @param eg2gene cache of already created genes; updated by this method
     * @return the cached or newly created gene, or null if there is no expression row for the id
     */
    private static Gene getGene(String egid, Map<String, double[]> eg2val,
        Map<String, Gene> eg2gene)
    {
        Gene cached = eg2gene.get(egid);
        // Only non-null genes are ever stored in the cache, so a null check is sufficient.
        if (cached != null) return cached;

        double[] vals = eg2val.get(egid);
        if (vals == null) return null;

        Gene gene = new Gene(egid, egid+"|"+egid, vals.length);
        gene.value = vals;
        eg2gene.put(egid, gene);
        return gene;
    }

    /**
     * Reads expression rows for the given gene ids and wraps each one in a Gene.
     *
     * @param egids gene ids of interest
     * @param dir directory containing platform.txt and data.txt
     * @param minvar threshold passed to {@link #readSubset}
     * @param minvarlog threshold passed to {@link #readSubset}
     * @return genes keyed by gene id, for every id that survived {@link #readSubset}
     * @throws IOException if the platform or data file cannot be read
     */
    public static Map<String, Gene> readGenes(Set<String> egids, String dir, double minvar,
        double minvarlog) throws IOException
    {
        Map<String, double[]> id2val = readSubset(egids, dir, minvar, minvarlog);
        Map<String, Gene> id2gene = new HashMap<String, Gene>();
        for (Map.Entry<String, double[]> entry : id2val.entrySet())
        {
            id2gene.put(entry.getKey(), new Gene(entry.getKey(), entry.getValue()));
        }
        return id2gene;
    }

    /**
     * Reads expression data of the mouse homologs of the given human gene ids, and returns it
     * keyed by the human ids.
     *
     * The human-to-mouse mapping is loaded from resource/human2mouse.txt (columns "Human" and
     * "Mouse"; mouse ids separated by ";", values starting with "-" are ignored). When several
     * mouse genes map to the same human gene, the row with the highest variance is used.
     *
     * @param humanids human gene ids of interest
     * @param dir directory containing the mouse platform.txt and data.txt
     * @param minvar threshold passed to {@link #readSubset}
     * @param minvarlog threshold passed to {@link #readSubset}
     * @return genes keyed by human gene id
     * @throws IOException if a mapping, platform or data file cannot be read
     */
    public static Map<String, Gene> readMouseHomologs(Set<String> humanids, String dir,
        double minvar, double minvarlog) throws IOException
    {
        System.out.println("initial size = " + humanids.size());

        TabDelimitedFileParser p = new TabDelimitedFileParser("resource/human2mouse.txt");
        Map<String, String> hum2mouStr = p.getOneToOneMap("Human", "Mouse");

        Map<String, List<String>> hum2mous = new HashMap<String, List<String>>();
        Map<String, List<String>> mou2hums = new HashMap<String, List<String>>();

        for (Map.Entry<String, String> entry : hum2mouStr.entrySet())
        {
            String hum = entry.getKey();
            String str = entry.getValue();
            if (str == null || str.startsWith("-")) continue;

            String[] ids = str.split(";");
            if (ids.length == 0) continue;

            if (!hum2mous.containsKey(hum)) hum2mous.put(hum, new ArrayList<String>());

            for (String id : ids)
            {
                id = id.trim();

                // A blank token (e.g. from "a;;b" or an empty cell) is not a gene id.
                if (id.length() == 0) continue;

                hum2mous.get(hum).add(id);

                if (!mou2hums.containsKey(id)) mou2hums.put(id, new ArrayList<String>());
                mou2hums.get(id).add(hum);

                if (mou2hums.get(id).size() > 1)
                {
                    System.out.println("mouse " + id + " maps to more than one human id");
                }
            }
        }

        Set<String> mouseIDs = new HashSet<String>();
        for (String humanid : humanids)
        {
            List<String> homologs = hum2mous.get(humanid);
            if (homologs != null) mouseIDs.addAll(homologs);
        }

        Map<String, double[]> id2val = readSubset(mouseIDs, dir, minvar, minvarlog);

        // Regroup the mouse rows under every human gene they are a homolog of.
        Map<String, List<double[]>> hum2vals = new HashMap<String, List<double[]>>();
        for (Map.Entry<String, double[]> entry : id2val.entrySet())
        {
            for (String hum : mou2hums.get(entry.getKey()))
            {
                if (!hum2vals.containsKey(hum)) hum2vals.put(hum, new ArrayList<double[]>());
                hum2vals.get(hum).add(entry.getValue());
            }
        }

        Map<String, double[]> hum2val = selectMostVaried(hum2vals);

        Map<String, Gene> id2gene = new HashMap<String, Gene>();
        for (Map.Entry<String, double[]> entry : hum2val.entrySet())
        {
            id2gene.put(entry.getKey(), new Gene(entry.getKey(), entry.getValue()));
        }

        System.out.println("mapped size = " + id2gene.size());
        return id2gene;
    }

    /**
     * Reads the expression rows of the given genes from dir/data.txt, using dir/platform.txt to
     * translate the row ids of the data file to gene ids.
     *
     * When several rows map to the same gene, the row with the highest variance is kept (the
     * earliest such row in the file on ties). A gene is then dropped if the natural log of its
     * variance is below minvar, or if Summary.varLog of its row is below minVarLog.
     *
     * @param egids gene ids of interest
     * @param dir directory containing platform.txt and data.txt
     * @param minvar minimum accepted value of log(variance) of a row
     * @param minVarLog minimum accepted value of Summary.varLog of a row
     * @return one expression row per surviving gene id
     * @throws IOException if the platform or data file cannot be read
     */
    public static Map<String, double[]> readSubset(Set<String> egids, String dir,
        double minvar, double minVarLog) throws IOException
    {
        Map<String, String> id2eg = mapRowIDsToGenes(egids, dir);
        Map<String, List<double[]>> eg2rows = readRows(id2eg, dir);
        Map<String, double[]> eg2row = selectMostVaried(eg2rows);

        Iterator<Map.Entry<String, double[]>> iter = eg2row.entrySet().iterator();
        while (iter.hasNext())
        {
            double[] vals = iter.next().getValue();

            // Note that minvar is compared against the log of the variance, not the variance.
            double var = Math.log(Summary.variance(vals));
            double varlog = Summary.varLog(vals);

            if (var < minvar || varlog < minVarLog)
            {
                iter.remove();
            }
        }
        return eg2row;
    }

    /**
     * Builds the map from data-file row id to gene id, restricted to the genes of interest.
     *
     * Platform keys that contain "/" are treated as multi-gene annotations: they are split on
     * spaces, the separator tokens (those starting with "/") are discarded, and the row ids are
     * registered under each remaining gene id.
     */
    private static Map<String, String> mapRowIDsToGenes(Set<String> egids, String dir)
        throws IOException
    {
        TabDelimitedFileParser parser = new TabDelimitedFileParser(dir + "/platform.txt");
        Map<String, Set<String>> eg2id = parser.getOneToManyMap("ENTREZ_GENE_ID", "ID");

        // Iterate over a snapshot of the keys because the map is modified in the loop.
        for (String eg : new HashSet<String>(eg2id.keySet()))
        {
            if (!eg.contains("/")) continue;

            Set<String> shared = eg2id.get(eg);
            if (shared != null)
            {
                for (String token : eg.split(" "))
                {
                    if (token.length() == 0 || token.startsWith("/")) continue;

                    // Merge instead of overwrite, so that row ids already registered for this
                    // gene are not lost. A fresh set is used so that genes never share (and
                    // never mutate) the same Set instance.
                    Set<String> merged = new HashSet<String>(shared);
                    Set<String> known = eg2id.get(token);
                    if (known != null) merged.addAll(known);
                    eg2id.put(token, merged);
                }
            }
            eg2id.remove(eg);
        }

        eg2id.keySet().retainAll(egids);

        // A row id registered under several requested genes ends up assigned to only one of
        // them (whichever is visited last here).
        Map<String, String> id2eg = new HashMap<String, String>();
        for (Map.Entry<String, Set<String>> entry : eg2id.entrySet())
        {
            for (String id : entry.getValue())
            {
                id2eg.put(id, entry.getKey());
            }
        }
        return id2eg;
    }

    /**
     * Parses the rows of dir/data.txt whose id is a key of id2eg, grouped by gene id and kept in
     * file order. The first line of the file is skipped as the header.
     */
    private static Map<String, List<double[]>> readRows(Map<String, String> id2eg, String dir)
        throws IOException
    {
        Map<String, List<double[]>> eg2rows = new HashMap<String, List<double[]>>();

        BufferedReader reader = new BufferedReader(new FileReader(dir + "/data.txt"));
        try
        {
            // skip header
            reader.readLine();

            for (String line = reader.readLine(); line != null; line = reader.readLine())
            {
                // Blank lines and lines without any value column carry no data.
                int tab = line.indexOf('\t');
                if (tab < 0) continue;

                String id = line.substring(0, tab).replace("\"", "");
                if (!id2eg.containsKey(id)) continue;

                String[] tokens = line.substring(tab + 1).split("\t");
                double[] row = new double[tokens.length];
                for (int i = 0; i < tokens.length; i++)
                {
                    row[i] = Double.parseDouble(tokens[i]);

                    // Comment in this line if the expressions are already in log scale
                    // row[i] = Math.exp(row[i]);
                }

                String eg = id2eg.get(id);
                if (!eg2rows.containsKey(eg)) eg2rows.put(eg, new ArrayList<double[]>());
                eg2rows.get(eg).add(row);
            }
        }
        finally
        {
            // Release the file handle even when a line fails to parse.
            reader.close();
        }
        return eg2rows;
    }

    /**
     * Reduces each list of candidate rows to a single row: the only row if there is just one,
     * otherwise the one chosen by {@link #getMostVaried}.
     *
     * @param id2vals candidate rows keyed by id
     * @return the selected row per id; ids with no candidates, or for which no row could be
     * selected, are left out
     */
    static Map<String, double[]> selectMostVaried(Map<String, List<double[]>> id2vals)
    {
        Map<String, double[]> id2val = new HashMap<String, double[]>();
        for (Map.Entry<String, List<double[]>> entry : id2vals.entrySet())
        {
            List<double[]> vals = entry.getValue();

            double[] selected = null;
            if (vals.size() == 1)
            {
                selected = vals.get(0);
            }
            else if (vals.size() > 1)
            {
                selected = getMostVaried(vals);
            }

            // Never store a null row; callers wrap the values into Gene objects.
            if (selected != null) id2val.put(entry.getKey(), selected);
        }
        return id2val;
    }

    /**
     * Finds the row with the highest Summary.variance; the earliest one wins on ties.
     *
     * @param list candidate rows
     * @return the row with the highest variance, or null if the list is empty or no variance
     * compares greater than -1 (e.g. every variance is NaN)
     */
    static double[] getMostVaried(List<double[]> list)
    {
        // Start below zero so that a row is still selected when every candidate is constant
        // (assuming Summary.variance is never negative).
        double maxvar = -1;
        double[] x = null;

        for (double[] v : list)
        {
            double var = Summary.variance(v);
            if (var > maxvar)
            {
                maxvar = var;
                x = v;
            }
        }
        return x;
    }
}