package gem;
import gem.parser.TabDelimitedFileParser;
import gem.util.Summary;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Assumes there is one series matrix file named data.txt, and one platform file named platform.txt.
 * @author Ozgun Babur
 */
public class ExpDataReader implements Constants
public static void main(String[] args) throws IOException
Set<String> ids = new HashSet<String>();
Map<String, double[]> map = readSubset(ids, "resource/expdata/duo", 10, 0.25);
/**
 * Looks up expression values for the genes of the given triplets (through readSubset on
 * dir) and assigns Gene objects to the M, F and T fields of every triplet.
 *
 * NOTE(review): this method appears truncated in the current file. The body delimiters are
 * missing, the first argument of the third getGene call is absent, the right-hand sides of
 * the three *_id assignments are absent, and no visible statement ever adds to the
 * returned list. Restore these from version control before relying on this method.
 *
 * @param trips triplets to associate with expression data
 * @param dir directory handed to readSubset
 * @param minvar threshold handed to readSubset
 * @param minvarlog threshold handed to readSubset
 * @return the list created below; as written nothing is ever added to it
 * @throws IOException propagated from readSubset
 */
public static List<Triplet> associate(List<Triplet> trips, String dir, double minvar, double minvarlog) throws IOException
// Presumably the gene IDs referenced by the triplets; verify against Triplet.getGeneIDs.
Set<String> egids = Triplet.getGeneIDs(trips);
// Expression row per gene ID, already filtered by readSubset.
Map<String, double[]> eg2val = readSubset(egids, dir, minvar, minvarlog);
// Cache so that getGene hands out one Gene instance per ID across all triplets.
Map<String, Gene> eg2gene = new HashMap<String, Gene>();
List<Triplet> list = new ArrayList<Triplet>();
for (Triplet t : trips)
// getGene returns null when no expression row is available for the ID.
t.M = getGene(t.modulator, eg2val, eg2gene);
t.F = getGene(t.factor, eg2val, eg2gene);
// NOTE(review): the first argument is missing here - presumably the triplet's target ID; confirm.
t.T = getGene(, eg2val, eg2gene);
// Only triplets for which all three genes were found pass this check.
if (t.M != null && t.F != null && t.T != null)
// NOTE(review): the assigned values are missing on the next three lines; confirm the intended sources.
t.mod_id =;
t.fac_id =;
t.tar_id =;
// NOTE(review): no visible statement adds t to list, so the result is always empty as written.
return list;
public static List<Tuple> associateTuples(List<Tuple> tuples, String dir, double minvar, double minvarlog) throws IOException
Set<String> egids = Tuple.getGeneIDs(tuples);
Map<String, double[]> eg2val = readSubset(egids, dir, minvar, minvarlog);
Map<String, Gene> eg2gene = new HashMap<String, Gene>();
List<Tuple> list = new ArrayList<Tuple>();
for (Tuple t : tuples)
Gene U = getGene(t.u_id, eg2val, eg2gene);
Gene T = getGene(t.t_id, eg2val, eg2gene);
if (U != null && T != null)
t.setGenes(U, T);
return list;
private static Gene getGene(String egid, Map<String, double[]> eg2val, Map<String, Gene> eg2gene)
if (eg2gene.containsKey(egid)) return eg2gene.get(egid);
double[] vals = eg2val.get(egid);
if (vals == null) return null;
Gene gene = new Gene(egid, egid+"|"+egid, vals.length);
gene.value = vals;
eg2gene.put(egid, gene);
return gene;
public static Map<String, Gene> readGenes(Set<String> egids, String dir, double minvar, double minvarlog) throws IOException
Map<String, double[]> id2val = readSubset(egids, dir, minvar, minvarlog);
Map<String, Gene> id2gene = new HashMap<String, Gene>();
for (String id : id2val.keySet())
id2gene.put(id, new Gene(id, id2val.get(id)));
return id2gene;
public static Map<String, Gene> readMouseHomologs(Set<String> humanids, String dir,
double minvar, double minvarlog) throws IOException
System.out.println("initial size = " + humanids.size());
TabDelimitedFileParser p = new TabDelimitedFileParser("resource/human2mouse.txt");
Map<String, String> hum2mouStr = p.getOneToOneMap("Human", "Mouse");
Map<String, List<String>> hum2mous = new HashMap<String, List<String>>();
Map<String, List<String>> mou2hums = new HashMap<String, List<String>>();
for (String hum : hum2mouStr.keySet())
String str = hum2mouStr.get(hum);
if (str.startsWith("-")) continue;
String[] ids = str.split(";");
if (ids.length == 0) continue;
if (!hum2mous.containsKey(hum)) hum2mous.put(hum, new ArrayList<String>());
for (String id : ids)
id = id.trim();
if (!mou2hums.containsKey(id)) mou2hums.put(id, new ArrayList<String>());
if (mou2hums.get(id).size() > 1)
System.out.println("mouse " + id + " maps to more than one human id");
Set<String> mouseIDs = new HashSet<String>();
for (String humanid : humanids)
if (!hum2mous.containsKey(humanid)) continue;
Map<String, double[]> id2val = readSubset(mouseIDs, dir, minvar, minvarlog);
Map<String, List<double[]>> hum2vals = new HashMap<String, List<double[]>>();
for (String mou : id2val.keySet())
for (String hum : mou2hums.get(mou))
if (!hum2vals.containsKey(hum)) hum2vals.put(hum, new ArrayList<double[]>());
Map<String, double[]> hum2val = selectMostVaried(hum2vals);
Map<String, Gene> id2gene = new HashMap<String, Gene>();
for (String id : hum2val.keySet())
id2gene.put(id, new Gene(id, hum2val.get(id)));
System.out.println("mapped size = " + id2gene.size());
return id2gene;
public static Map<String, double[]> readSubset(Set<String> egids, String dir,
double minvar, double minVarLog) throws IOException
TabDelimitedFileParser parser = new TabDelimitedFileParser(dir + "/platform.txt");
Map<String, Set<String>> eg2id = parser.getOneToManyMap("ENTREZ_GENE_ID", "ID");
for (String eg : new HashSet<String>(eg2id.keySet()))
if (eg.contains("/"))
String[] tok = eg.split(" ");
for (String token : tok)
if (!token.startsWith("/")) eg2id.put(token, eg2id.get(eg));
for (String eg : new HashSet<String>(eg2id.keySet()))
if (!egids.contains(eg)) eg2id.remove(eg);
Map<String, String> id2eg = new HashMap<String, String>();
for (String eg : eg2id.keySet())
for (String id : eg2id.get(eg))
id2eg.put(id, eg);
Map<String, Set<double[]>> eg2rows = new HashMap<String, Set<double[]>>();
BufferedReader reader = new BufferedReader(new FileReader(dir + "/data.txt"));
// skip header
for (String line = reader.readLine(); line != null; line = reader.readLine())
String id = line.substring(0, line.indexOf("\t")).replace("\"", "");
if (!id2eg.containsKey(id)) continue;
line = line.substring(line.indexOf("\t") + 1);
String[] tokens = line.split("\t");
double[] row = new double[tokens.length];
for (int i = 0; i < tokens.length; i++)
row[i] = Double.parseDouble(tokens[i]);
// Comment in this line if the expressions are already in log scale
// row[i] = Math.exp(row[i]);
String eg = id2eg.get(id);
if (!eg2rows.containsKey(eg)) eg2rows.put(eg, new HashSet<double[]>());
Map<String, double[]> eg2row = new HashMap<String, double[]>();
for (String eg : eg2rows.keySet())
Set<double[]> rows = eg2rows.get(eg);
assert !rows.isEmpty();
if (rows.size() == 1)
eg2row.put(eg, rows.iterator().next());
double[] maxrow = null;
double maxvar = -1;
for (double[] row : rows)
double var = Summary.variance(row);
if (var > maxvar)
maxvar = var;
maxrow = row;
if (maxrow != null) eg2row.put(eg, maxrow);
for (String egid : new HashSet<String>(eg2row.keySet()))
double[] vals = eg2row.get(egid);
double var = Math.log(Summary.variance(vals));
double varlog = Summary.varLog(vals);
if (var < minvar || varlog < minVarLog)
return eg2row;
static Map<String, double[]> selectMostVaried(Map<String, List<double[]>> id2vals)
Map<String, double[]> id2val = new HashMap<String, double[]>();
for (String id : id2vals.keySet())
List<double[]> vals = id2vals.get(id);
if (vals.size() == 1)
id2val.put(id, vals.iterator().next());
else if (vals.size() > 1)
id2val.put(id, getMostVaried(vals));
return id2val;
static double[] getMostVaried(List<double[]> list)
double maxvar = 0;
double[] x = null;
for (double[] v : list)
double var = Summary.variance(v);
if (var > maxvar)
maxvar = var;
x = v;
return x;