Package gem

Source Code of gem.ExpDataReader

package gem;

import gem.parser.TabDelimitedFileParser;
import gem.util.Summary;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/**
* Assumes there is one series matrix file named data.txt, and one platform file named platform.txt.
*
* @author Ozgun Babur
*/
public class ExpDataReader implements Constants
{
  public static void main(String[] args) throws IOException
  {
    Set<String> ids = new HashSet<String>();
    ids.add("367");
    Map<String, double[]> map = readSubset(ids, "resource/expdata/duo", 10, 0.25);
    System.out.println(map.size());
  }

  public static List<Triplet> associate(List<Triplet> trips, String dir, double minvar, double minvarlog) throws IOException
  {
    Set<String> egids = Triplet.getGeneIDs(trips);
    Map<String, double[]> eg2val = readSubset(egids, dir, minvar, minvarlog);
    Map<String, Gene> eg2gene = new HashMap<String, Gene>();

    List<Triplet> list = new ArrayList<Triplet>();
    for (Triplet t : trips)
    {
      t.M = getGene(t.modulator, eg2val, eg2gene);
      t.F = getGene(t.factor, eg2val, eg2gene);
      t.T = getGene(t.target, eg2val, eg2gene);

      if (t.M != null && t.F != null && t.T != null)
      {
        t.mod_id = t.M.id;
        t.fac_id = t.F.id;
        t.tar_id = t.T.id;

        list.add(t);
      }
    }
    return list;
  }

  public static List<Tuple> associateTuples(List<Tuple> tuples, String dir, double minvar, double minvarlog) throws IOException
  {
    Set<String> egids = Tuple.getGeneIDs(tuples);
    Map<String, double[]> eg2val = readSubset(egids, dir, minvar, minvarlog);
    Map<String, Gene> eg2gene = new HashMap<String, Gene>();

    List<Tuple> list = new ArrayList<Tuple>();
    for (Tuple t : tuples)
    {
      Gene U = getGene(t.u_id, eg2val, eg2gene);
      Gene T = getGene(t.t_id, eg2val, eg2gene);

      if (U != null && T != null)
      {
        t.setGenes(U, T);
        list.add(t);
      }
    }
    return list;
  }

  private static Gene getGene(String egid, Map<String, double[]> eg2val, Map<String, Gene> eg2gene)
  {
    if (eg2gene.containsKey(egid)) return eg2gene.get(egid);

    double[] vals = eg2val.get(egid);

    if (vals == null) return null;

    Gene gene = new Gene(egid, egid+"|"+egid, vals.length);
    gene.value = vals;
    eg2gene.put(egid, gene);
    return gene;
  }

  public static Map<String, Gene> readGenes(Set<String> egids, String dir, double minvar, double minvarlog) throws IOException
  {
    Map<String, double[]> id2val = readSubset(egids, dir, minvar, minvarlog);
    Map<String, Gene> id2gene = new HashMap<String, Gene>();

    for (String id : id2val.keySet())
    {
      id2gene.put(id, new Gene(id, id2val.get(id)));
    }

    return id2gene;
  }

  public static Map<String, Gene> readMouseHomologs(Set<String> humanids, String dir,
    double minvar, double minvarlog) throws IOException
  {
    System.out.println("initial size = " + humanids.size());
    TabDelimitedFileParser p = new TabDelimitedFileParser("resource/human2mouse.txt");
    Map<String, String> hum2mouStr = p.getOneToOneMap("Human", "Mouse");

    Map<String, List<String>> hum2mous = new HashMap<String, List<String>>();
    Map<String, List<String>> mou2hums = new HashMap<String, List<String>>();
    for (String hum : hum2mouStr.keySet())
    {
      String str = hum2mouStr.get(hum);
      if (str.startsWith("-")) continue;
      String[] ids = str.split(";");
      if (ids.length == 0) continue;
      if (!hum2mous.containsKey(hum)) hum2mous.put(hum, new ArrayList<String>());
      for (String id : ids)
      {
        id = id.trim();
        hum2mous.get(hum).add(id);
        if (!mou2hums.containsKey(id)) mou2hums.put(id, new ArrayList<String>());
        mou2hums.get(id).add(hum);
        if (mou2hums.get(id).size() > 1)
        {
          System.out.println("mouse " + id + " maps to more than one human id");
        }
      }
    }

    Set<String> mouseIDs = new HashSet<String>();
    for (String humanid : humanids)
    {
      if (!hum2mous.containsKey(humanid)) continue;
      mouseIDs.addAll(hum2mous.get(humanid));
    }

    Map<String, double[]> id2val = readSubset(mouseIDs, dir, minvar, minvarlog);

    Map<String, List<double[]>> hum2vals = new HashMap<String, List<double[]>>();
    for (String mou : id2val.keySet())
    {
      for (String hum : mou2hums.get(mou))
      {
        if (!hum2vals.containsKey(hum)) hum2vals.put(hum, new ArrayList<double[]>());
        hum2vals.get(hum).add(id2val.get(mou));
      }
    }

    Map<String, double[]> hum2val = selectMostVaried(hum2vals);

    Map<String, Gene> id2gene = new HashMap<String, Gene>();

    for (String id : hum2val.keySet())
    {
      id2gene.put(id, new Gene(id, hum2val.get(id)));
    }
    System.out.println("mapped size = " + id2gene.size());

    return id2gene;
  }

  public static Map<String, double[]> readSubset(Set<String> egids, String dir,
    double minvar, double minVarLog) throws IOException
  {
    TabDelimitedFileParser parser = new TabDelimitedFileParser(dir + "/platform.txt");
    Map<String, Set<String>> eg2id = parser.getOneToManyMap("ENTREZ_GENE_ID", "ID");

    for (String eg : new HashSet<String>(eg2id.keySet()))
    {
      if (eg.contains("/"))
      {
        String[] tok = eg.split(" ");
        for (String token : tok)
        {
          if (!token.startsWith("/")) eg2id.put(token, eg2id.get(eg));
        }
        eg2id.remove(eg);
      }
    }

    for (String eg : new HashSet<String>(eg2id.keySet()))
    {
      if (!egids.contains(eg)) eg2id.remove(eg);
    }

    Map<String, String> id2eg = new HashMap<String, String>();

    for (String eg : eg2id.keySet())
    {
      for (String id : eg2id.get(eg))
      {
        id2eg.put(id, eg);
      }
    }

    Map<String, Set<double[]>> eg2rows = new HashMap<String, Set<double[]>>();
    BufferedReader reader = new BufferedReader(new FileReader(dir + "/data.txt"));

    // skip header
    reader.readLine();

    for (String line = reader.readLine(); line != null; line = reader.readLine())
    {
      String id = line.substring(0, line.indexOf("\t")).replace("\"", "");

      if (!id2eg.containsKey(id)) continue;

      line = line.substring(line.indexOf("\t") + 1);

      String[] tokens = line.split("\t");
      double[] row = new double[tokens.length];
      for (int i = 0; i < tokens.length; i++)
      {
        row[i] = Double.parseDouble(tokens[i]);

        // Comment in this line if the expressions are already in log scale
//        row[i] = Math.exp(row[i]);
      }

      String eg = id2eg.get(id);

      if (!eg2rows.containsKey(eg)) eg2rows.put(eg, new HashSet<double[]>());
      eg2rows.get(eg).add(row);
    }

    reader.close();

    Map<String, double[]> eg2row = new HashMap<String, double[]>();

    for (String eg : eg2rows.keySet())
    {
      Set<double[]> rows = eg2rows.get(eg);
      assert !rows.isEmpty();
      if (rows.size() == 1)
      {
        eg2row.put(eg, rows.iterator().next());
      }
      else
      {
        double[] maxrow = null;
        double maxvar = -1;

        for (double[] row : rows)
        {
          double var = Summary.variance(row);
          if (var > maxvar)
          {
            maxvar = var;
            maxrow = row;
          }
        }

        if (maxrow != null) eg2row.put(eg, maxrow);
      }
    }

    for (String egid : new HashSet<String>(eg2row.keySet()))
    {
      double[] vals = eg2row.get(egid);

      double var = Math.log(Summary.variance(vals));
      double varlog = Summary.varLog(vals);

      if (var < minvar || varlog < minVarLog)
      {
        eg2row.remove(egid);
      }
    }

    return eg2row;
  }

  static Map<String, double[]> selectMostVaried(Map<String, List<double[]>> id2vals)
  {
    Map<String, double[]> id2val = new HashMap<String, double[]>();

    for (String id : id2vals.keySet())
    {
      List<double[]> vals = id2vals.get(id);
      if (vals.size() == 1)
      {
        id2val.put(id, vals.iterator().next());
      }
      else if (vals.size() > 1)
      {
        id2val.put(id, getMostVaried(vals));
      }
    }
    return id2val;
  }

  static double[] getMostVaried(List<double[]> list)
  {
    double maxvar = 0;
    double[] x = null;
    for (double[] v : list)
    {
      double var = Summary.variance(v);
      if (var > maxvar)
      {
        maxvar = var;
        x = v;
      }
    }
    return x;
  }
}
TOP

Related Classes of gem.ExpDataReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.