Package gem

Source Code of gem.VarianceAnalyzer

package gem;

import gem.parser.TabDelimitedFileParser;
import gem.util.*;

import java.io.*;
import java.util.*;

/**
* @author Ozgun Babur
*/
public class VarianceAnalyzer implements Constants
{
  public static void main(String[] args) throws Throwable
  {
//    printSummaryFor("RUNX1", "SLC8A1", "HIPK2", "DNAH1", "STAT1", "AR", "CD44", "EGFR", "FOXP2");
//    filterExpressionData("resource/expdata/expo");
    test6();
  }

  // Distribution of variance
  private static void test6() throws IOException
  {
    double range = 0.05;

    Histogram h1 = new Histogram(range);
    Histogram h2 = new Histogram(range);
    Histogram h3 = new Histogram(range);

    Histogram2D h2D1 = new Histogram2D(range);
    Histogram2D h2D1sub = new Histogram2D(range);

    Histogram2D h2D2 = new Histogram2D(range);
    Histogram2D h2D2sub = new Histogram2D(range);

    Set<Double> d1 = new HashSet<Double>();
    Set<Double> d2 = new HashSet<Double>();

    Set<Point> p1 = new HashSet<Point>();
    Set<Point> p2 = new HashSet<Point>();
    Set<Point> p3 = new HashSet<Point>();

//    String dir = "resource/expdata/GSE1133-GPL96/";
//    String dir = "resource/expdata/GSE13159/";
//    String dir = "resource/expdata/GSE7307/";
    String dir = "resource/expdata/expo_select/";
    TabDelimitedFileParser parser = new TabDelimitedFileParser(dir + "platform.txt");
    Map<String, String> id2sym = parser.getOneToOneMap("ID", "Gene Symbol");

    parser = new TabDelimitedFileParser("resource/phospho-site/ModifCounts.txt");
    Map<String, String> sym2modif = parser.getOneToOneMap("Gene", "ModifCnt");

    String[] tisIndex = getTissueIndexes();

//    Progress prg = new Progress(54675);
    Progress prg = new Progress(19800);

    BufferedReader reader = new BufferedReader(new FileReader(dir + "data.txt"));

    String line = reader.readLine();

    int k = 0;
    for (line = reader.readLine(); line != null; line = reader.readLine())
    {
      k++;
      prg.tick();

      String id = line.substring(0, line.indexOf("\t"));
      String sym = id2sym.get(id);
      boolean hasGeneSymbol = sym != null && sym.trim().length() > 0 && !sym.contains("/");

      int modif = (hasGeneSymbol && sym2modif.containsKey(sym)) ?
        Integer.parseInt(sym2modif.get(sym)) : 0;

      line = line.substring(line.indexOf("\t") + 1);

      double[] vals = parseValues(line);

//      for (int i = 0; i < vals.length; i++) vals[i] = Math.pow(Math.E, vals[i]);

      Gene gene = new Gene(sym, id, vals.length);
      gene.value = vals;
      gene.rankAdjustStatus(1D/3);
//      double spe1 = calcSPEByMeans(gene, tisIndex);
      double spe2 = calcSPEByRatios(gene, tisIndex);


      double dev1 = Summary.stdev(vals);
      double mean1 = Summary.mean(vals);

      double dev3 = dev1 / mean1;

      dev1 = Math.log(dev1);
      mean1 = Math.log(mean1);

//      if (spe1 > 2 && spe2 > 0.5 && mean1 > 8) System.out.println("high: " + sym);
//      if (spe1 < 0.15 && spe2 < 0.15 && mean1 > 7) System.out.println("low: " + sym);

      for (int i = 0; i < vals.length; i++)
      {
        vals[i] = Math.log(vals[i]);
      }
      double dev2 = Summary.stdev(vals);
      double mean2 = Summary.mean(vals);

//      h2D1.count(dev2, dev3);

      if (hasGeneSymbol)
      {
//        if (spe2 < 0.197)
//        {
//          h1.count(dev2);
//          d1.add(dev2);
//        }
//        else
//        {
//          h2.count(dev2);
//          d2.add(dev2);
//        }

        p3.add(new Point(dev1, dev2, modif > 0 ? 1. : 0.));

        h2D1.count(mean1, dev1);
        h2D2.count(mean1, dev2);
//        if (dev3 > 1)
//        if (spe2 > 0.4)
//        if (mean1 > 6)
        if (modif > 0)
//        if (hasGeneSymbol)
        {
          h2D1sub.count(mean1, dev1);
          h2D2sub.count(mean1, dev2);
//          p2.add(new Point(dev1, dev2));
        }
//        else p1.add(new Point(dev1, dev2));

      }
    }

    reader.close();
    System.out.println("lines = " + k);

    Kolmogorov.plotMovingD(p3, 3, 9, 1, 0.1);

//    h2D1.plot(false);
//    h2D2.plot(false);

    Histogram2DPlot pl = new Histogram2DPlot(h2D1, h2D1sub);
    pl.setVisible(true);
    pl = new Histogram2DPlot(h2D2, h2D2sub);
    pl.setVisible(true);

//    Histogram.printAll(h1, h2, h3);
//    h1.printTogether(h2);

//    h1.printDensity();
//    System.out.println("----");
//    h2.printDensity();
//    System.out.println("----");
//    h3.printDensity();

//    double d = Kolmogorov.calcDStat(d1, d2);
//    System.out.println("d1.size() = " + d1.size());
//    System.out.println("d2.size() = " + d2.size());
//    System.out.println("d stat = " + d);
//    System.out.println("p-val = " + Kolmogorov.calcPval(d, d1.size(), d2.size()));
  }

  private static String[] getTissueIndexes() throws IOException
  {
    String firstLine = FileUtil.getFirstLine("resource/expdata/expo/data.txt");
    firstLine = firstLine.substring(firstLine.indexOf("\""));
    String[] expNames = firstLine.split("\t");
    for (int i = 0; i < expNames.length; i++)
    {
      expNames[i] = expNames[i].substring(1, expNames[i].length() - 1);
    }
    TabDelimitedFileParser parser = new TabDelimitedFileParser("resource/celltypes_expO.txt");
    Map<String, String> exp2tiss = parser.getOneToOneMap("Sample", "Tissue");

    for (String exp : new HashSet<String>(exp2tiss.keySet()))
    {
//      exp2tiss.put(exp, exp2tiss.get(exp).substring(0, exp2tiss.get(exp).indexOf(" ")).toLowerCase());
      exp2tiss.put(exp, exp2tiss.get(exp).toLowerCase());
    }
    String[] index = new String[expNames.length];

    for (int i = 0; i < index.length; i++)
    {
      index[i] = exp2tiss.get(expNames[i]);
    }
    return index;
  }

  private static double getTissueSpecificity(Gene gene, String[] tissName)
  {
    Map<String, int[]> count = new HashMap<String, int[]>();
    Map<String, Set<Double>> values = new HashMap<String, Set<Double>>();

    double globalMean = 0;

    for (int i = 0; i < gene.status.length; i++)
    {
      globalMean += gene.value[i];

      if (!count.containsKey(tissName[i])) count.put(tissName[i], new int[] {0, 0});
      if (!values.containsKey(tissName[i])) values.put(tissName[i], new HashSet<Double>());

      values.get(tissName[i]).add(gene.value[i]);

      if (gene.status[i] != MARGINAL)
      {
        count.get(tissName[i])[gene.status[i] == PRESENT ? 1 : 0]++;
      }
    }

    globalMean = globalMean / gene.value.length;

    double maxMetric = -Double.MAX_VALUE;

    Set<Double> metrics = new HashSet<Double>();

    for (String tis : values.keySet())
    {

      Set<Double> vals = values.get(tis);

      if (vals.size() < 20) continue;

      int over = 0;
      for (Double val : vals)
      {
        if (val > globalMean) over++;
      }

      double metric = over / (double) vals.size();
//-------
//      int ca = count.get(tis)[0];
//      int cp = count.get(tis)[1];
//
//      double total = ca + cp;
//      if (total < 20) continue;
//
//      double mean = cp / total;
//-------

//-------
//      Set<Double> vals = values.get(tis);
//
//      if (vals.size() < 20) continue;
//
//      double mean = 0;
//      for (Double v : vals) mean += v;
//      mean /= vals.size();
//      mean = Math.log(mean);
//-------

      metrics.add(metric);

      if (metric > maxMetric) maxMetric = metric;
    }

    double dif = 0;
    for (Double metric : metrics)
    {
      dif += maxMetric - metric;
    }
    return dif / metrics.size();

//    return maxMean - minMean;

//    int leftSurplus = 0;
//    int rightSurplus = 0;
//    double ent = 0;
//    for (String tiss : count.keySet())
//    {
//      double p = count.get(tiss)[1] / (double) (count.get(tiss)[1] + count.get(tiss)[0]);
//      if (p > 0.0000001 && p < 0.9999999)
//      {
//        ent += - p * Math.log(p);
//      }
//    }
//    return ent;

//    {
//      if (count.get(tiss)[1] + count.get(tiss)[0] < 10) continue;
//
//      int dif = count.get(tiss)[1] - count.get(tiss)[0];
//      if (dif > 0) rightSurplus += dif;
//      else leftSurplus -= dif;
//    }
//    return Math.log(leftSurplus * rightSurplus);
  }

  private static double calcSPEByEntropy(Gene gene, String[] tissName)
  {
    Map<String, int[]> count = new HashMap<String, int[]>();

    for (int i = 0; i < gene.status.length; i++)
    {
      if (!count.containsKey(tissName[i])) count.put(tissName[i], new int[] {0, 0});

      if (gene.status[i] != MARGINAL)
      {
        count.get(tissName[i])[gene.status[i] == PRESENT ? 1 : 0]++;
      }
    }

    double ent = 0;
    for (String tiss : count.keySet())
    {
      double p = count.get(tiss)[1] / (double) (count.get(tiss)[1] + count.get(tiss)[0]);
      if (p > 0.0000001 && p < 0.9999999)
      {
        ent += - p * Math.log(p);
      }
    }
    return ent;
  }

  static double calcSPEByMeans(Gene gene, String[] tissName)
  {
    Map<String, Set<Double>> values = getTissueValues(gene, tissName);

    double maxMetric = -Double.MAX_VALUE;

    Map<String, Double> metrics = new HashMap<String, Double>();

    for (String tis : values.keySet())
    {
      Set<Double> vals = values.get(tis);

      if (vals.size() < 20) continue;

      double sum = 0;
      for (Double val : vals)
      {
        sum += val;
      }

      double metric = Math.log(sum / vals.size());

      metrics.put(tis, metric);

      if (metric > maxMetric) maxMetric = metric;
    }

//    if (gene.geneid.equals("SLC3A1")) printTissueMetrics(gene, metrics);

    double dif = 0;
    for (Double metric : metrics.values())
    {
      dif += maxMetric - metric;
    }
    return dif / (metrics.size() - 1);
  }

  static double calcSPEByRatios(Gene gene, String[] tissName)
  {
    Map<String, Set<Double>> values = getTissueValues(gene, tissName);

    double mid = Summary.median(gene.value);

    double maxMetric = -Double.MAX_VALUE;

    Map<String, Double> metrics = new HashMap<String, Double>();

    for (String tis : values.keySet())
    {
      Set<Double> vals = values.get(tis);

      if (vals.size() < 20) continue;

      int over = 0;
      for (Double val : vals)
      {
        if (val > mid) over++;
      }

      double metric = over / (double) vals.size();

      metrics.put(tis, metric);

      if (metric > maxMetric) maxMetric = metric;
    }

//    if (gene.geneid.equals("SLC3A1")) printTissueMetrics(gene, metrics);

    double dif = 0;
    for (Double metric : metrics.values())
    {
      dif += maxMetric - metric;
    }
    return dif / (metrics.size() - 1);
  }

  static private void printTissueMetrics(Gene gene, Map<String, Double> values)
  {
    System.out.println("\n" + gene.geneid);
    for (String tis : values.keySet())
    {
      System.out.println(tis + " " + values.get(tis));
    }
  }

  private static Map<String, Set<Double>> getTissueValues(Gene gene, String[] tissName)
  {
    Map<String, Set<Double>> values = new HashMap<String, Set<Double>>();

    for (int i = 0; i < gene.status.length; i++)
    {
      if (!values.containsKey(tissName[i]))
        values.put(tissName[i], new HashSet<Double>());

      values.get(tissName[i]).add(gene.value[i]);
    }
    return values;
  }

  /**
   * Leaves one row per gene symbol. Selects rows with max mean expression.
   * @param dir
   */
  private static void filterExpressionData(String dir) throws IOException
  {
    TabDelimitedFileParser parser = new TabDelimitedFileParser(dir + "/platform.txt");
    Map<String, String> id2sym = parser.getOneToOneMap("ID", "Gene Symbol");

    Map<String, String> sym2row = new HashMap<String, String>();
    Map<String, Double> sym2mean = new HashMap<String, Double>();
    Map<String, Integer> sym2cnt = new HashMap<String, Integer>();

    Progress prg = new Progress(54675);

    BufferedReader reader = new BufferedReader(new FileReader(dir + "/data.txt"));

    String outDir = dir + "_select";
    File file = new File(outDir);
    file.mkdir();

    BufferedWriter writer = new BufferedWriter(new FileWriter(outDir + "/data.txt"));

    String line = reader.readLine();
    writer.write(line);

    for (line = reader.readLine(); line != null; line = reader.readLine())
    {
      prg.tick();

      String id = line.substring(0, line.indexOf("\t"));
      String sym = id2sym.get(id);
      boolean hasGeneSymbol = sym != null && sym.trim().length() > 0 && !sym.contains("/");

      if (hasGeneSymbol)
      {
        if (sym2cnt.containsKey(sym)) sym2cnt.put(sym, sym2cnt.get(sym) + 1);
        else sym2cnt.put(sym, 1);

        double[] val = parseValues(line.substring(line.indexOf("\t") + 1));
        double mean = Summary.mean(val);

        if (!sym2row.containsKey(sym) || sym2mean.get(sym) < mean)
        {
          sym2mean.put(sym, mean);
          sym2row.put(sym, line);
        }
      }
    }

    reader.close();

    for (String row : sym2row.values())
    {
      writer.write("\n" + row);
    }

    writer.close();

    Histogram h = new Histogram(1);
    for (String sym : sym2cnt.keySet())
    {
      int cnt = sym2cnt.get(sym);
      h.count(cnt);

      if (cnt > 7)
      {
        System.out.println(sym + "\t" + cnt);
      }
    }
    h.print();
  }

  private static double[] parseValues(String line)
  {
    String[] token = line.split("\t");
    double[] val = new double[token.length];
    for (int i = 0; i < token.length; i++)
    {
      val[i] = Double.parseDouble(token[i]);
    }
    return val;
  }

  private static void printSummaryFor(String... sym) throws IOException
  {
    Map<String, List<double[]>> map = getValuesOf(sym);

    for (String s : map.keySet())
    {
      System.out.println("\n" + s);
      List<double[]> vals = map.get(s);

      double[] x = new double[vals.size()];
      double[] y = new double[vals.size()];
      int i = 0;

      for (double[] val : vals)
      {
        double mean = Math.log(Summary.mean(val));
        double logdev = Math.log(Summary.stdev(val));
        double devlog = Summary.stdev(Summary.log(val));

        x[i] = mean;
        y[i++] = logdev;

        System.out.println(mean + "\t" + logdev + "\t" + devlog);
      }

      double[] r = LinearRegression.regress(x, y);
      System.out.println("a = " + r[0]);
      System.out.println("r2 = " + r[2]);

    }
  }

  private static Map<String, List<double[]>> getValuesOf(String[] sym) throws IOException
  {
    Map<String, List<double[]>> map = new HashMap<String, List<double[]>>();

    String dir = "resource/expdata/expo/";
    TabDelimitedFileParser parser = new TabDelimitedFileParser(dir + "platform.txt");
    Map<String, Set<String>> sym2id = parser.getOneToManyMap("Gene Symbol", "ID");

    Map<String, String> id2sym = new HashMap<String, String>();

    for (String s : sym)
    {
      if (!sym2id.containsKey(s)) continue;

      for (String id : sym2id.get(s))
      {
        id2sym.put(id, s);
      }
    }

    Set<String> ids = id2sym.keySet();

    BufferedReader reader = new BufferedReader(new FileReader(dir + "/data.txt"));

    for (String line = reader.readLine(); line != null; line = reader.readLine())
    {
      String rowID = line.substring(0, line.indexOf("\t"));

      if (ids.contains(rowID))
      {
        String s = id2sym.get(rowID);
        if (!map.containsKey(s)) map.put(s, new ArrayList<double[]>());
        map.get(s).add(parseValues(line.substring(line.indexOf("\t") + 1)));
      }
    }

    return map;
  }
}
TOP

Related Classes of gem.VarianceAnalyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.