Package gem

Source Code of gem.ClusterAnalyzer

package gem;

import gem.parser.HPRDParser;
import gem.parser.TabDelimitedFileParser;
import gem.util.*;

import java.awt.*;
import java.awt.geom.QuadCurve2D;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;
import java.util.List;

import static gem.StageAnalyzer.getPos;

/**
* @author Ozgun Babur
*/
public class ClusterAnalyzer implements Constants
{
  public static final double CORR_PV_THR = 1;
  public static final double SEED_EDGE_CORR_THR = 0;
  public static final double CLUSTER_SCORE_THR = 0.1;
  public static final int MODULE_SIZE_THR = 5;
  public static final int AVG_BINS_PER_GENE = 5;

  public static void main(String[] args) throws Throwable
  {
    TabDelimitedFileParser p = new TabDelimitedFileParser("resource/factors/AR-select-small.txt");
    Set<String> tarSym = p.getColumnSet(0);

    List<String> modList = getModSymSet();
//    modList.removeAll(tarSym);
//    modList.add("KLK3");

    Set<String> ids = new HashSet<String>();
    for (String g : tarSym) ids.add(Triplet.getSymbolToGeneMap().get(g));
    for (String g : modList) ids.add(Triplet.getSymbolToGeneMap().get(g));

    String dir = "resource/expdata/expo";
    Map<String, Gene> map = ExpDataReader.readGenes(ids, dir, 0, 0);
//    Map<String, Gene> map = CrossPlatformMapper.fetchGenes(ids, dir + "/data.txt");
    boolean[][] pos = getPos(dir + "/");
    String[] tisName = StageAnalyzer.getStageNames(dir + "/");
    Set<Gene> tars = new HashSet<Gene>(map.values());

//    CellTypeMatcher.filterExpsToStage(tars, pos[7]);

    List<Gene> mods = new ArrayList<Gene>();

    for (String modSym : modList)
    {
      Gene mod = map.get(Triplet.getSymbolToGeneMap().get(modSym));
      if (mod != null) mods.add(mod);
    }

    tars.removeAll(mods);
    System.out.println("tar size = " + tars.size());

    Gene[] allTars = tars.toArray(new Gene[tars.size()]);

    Gene[] modsArr = mods.toArray(new Gene[mods.size()]);
    Arrays.sort(modsArr);
    takeLogIfNeeded(allTars);
    takeLogIfNeeded(modsArr);
    int[] signAll = getSigns(allTars);

    allTars = filterToPositive(allTars, signAll);
    Arrays.sort(allTars);
    signAll = getSigns(allTars);

    printSignDistr(signAll);
    Gene probe = null;
//    Gene probe = get(modsArr, "FOXA1");
//    System.out.println("probe.getSymbol() = " + probe.getSymbol());

//    printCorrMatrix(modsArr);
//    writeCorrGraph(modsArr);
//    printResistanceGeneEnrichment(allTars, modsArr, signAll);
//    plotValueHisto(get(modsArr, "FLNA"), get(modsArr, "SVIL"));

    List<List<Gene>> moduleList = new ArrayList<List<Gene>>();

    int i = 5;
//    for (int i = 0; i < pos.length; i++)
    {
      System.out.println("\n\nCalculating for tissue: " + tisName[i]);
      System.out.println("sample size = " + ArrayUtils.countTrue(pos[i]));

      printModuleScores(allTars, signAll, pos[i], modsArr);
//      if (true) continue;

      List<Gene[]> modules = getCorrelatedModules(allTars, signAll, pos[i], probe);
      System.out.println("Number of modules = " + modules.size() + "\n");

      for (Gene[] module : modules)
      {
//        if (module.length < 10) continue;

        int[] sign = getSigns(module);
//        printSignDistr(sign);

        printModuleInfo(module, sign, pos[i]);

        List<Holder> h = new ArrayList<Holder>();
        List<Gene> correlatingMods = new ArrayList<Gene>();

        System.out.println("Modulator correlations:");
        for (Gene mod : mods)
        {
          if (mod == null) continue;

          Holder h0 = new Holder(mod.getSymbol(), mod, module, sign, pos[i]);

          boolean[] half = filterHalf(mod, pos[i], true);
          Holder h1 = new Holder(mod.getSymbol() + "+", mod, module, sign, half);

          half = filterHalf(mod, pos[i], false);
          Holder h2 = new Holder(mod.getSymbol() + "-", mod, module, sign, half);

//          addNecessary(h, h0, h1, h2);
          h.add(h0);
        }
        Collections.sort(h);
        List<Holder> corHold = new ArrayList<Holder>();
        for (Holder ho : h)
        {
          if (ho.pval < 0.05)
//          if (Math.abs(ho.cor) > 0.2)
          {
            System.out.println(ho);// + "\t" + getAverageCorrelation(ho.mod, module, sign, pos[5])[0]);
            correlatingMods.add(ho.mod);
            corHold.add(ho);

            if (ho.mod == probe) moduleList.add(Arrays.asList(module));
          }
        }
//        printCorrMatrix(corHold);
//        printModDependency(corHold);
        System.out.println();
      }
    }
//    moduleList.add(getGRResponseModule(allTars));
//    printModuleOVerlapMatrix(moduleList, allTars.length);
  }

  private static void writeGeneVals(Gene[] gene, boolean [] pos) throws IOException
  {
    BufferedWriter writer = new BufferedWriter(new FileWriter("sample.csv"));

//    writer.write("\"Name\"");
    String s = "";
    for (int i = 0; i < pos.length; i++)
    {
      if (pos[i]) s += "\"exp" + i + "\",";
    }
    s = s.substring(0, s.length() - 1);
    writer.write(s);

    for (Gene g : gene)
    {
      writer.write("\n");
      s = "";
      for (int i = 0; i < pos.length; i++)
      {
        if (pos[i]) s += g.value[i] + ",";
      }
      s = s.substring(0, s.length()-1);
      writer.write(s);
    }

    writer.close();
  }

  private static Gene[] filterToPositive(Gene[] gene, int[] sign)
  {
    List<Gene> filtered = new ArrayList<Gene>();
    for (int i = 0; i < gene.length; i++)
    {
      if (sign[i] == 1) filtered.add(gene[i]);
    }
    return filtered.toArray(new Gene[filtered.size()]);
  }

  static Gene get(Gene[] genes, String symb)
  {
    for (Gene gene : genes)
    {
      if (gene.getSymbol().equals(symb)) return gene;
    }
    return null;
  }

  static void plotValueHisto(Gene g1, Gene g2)
  {
    Histogram2D h = new Histogram2D(0.1);
    for (int i = 0; i < g1.value.length; i++)
    {
      h.count(g1.value[i], g2.value[i]);
    }
    h.plot();
  }

  private static void printModuleInfo(Gene[] module, int[] sign, boolean [] pos)
  {
    System.out.println("module size = " + module.length);
    System.out.println("avg expr = " + (int) getAverageExpression(module));
    System.out.println("avg corr = " + fmt.format(getAverageCorrelation(module, sign, pos)));

//    System.out.println("Members:");
//    List<String> names = new ArrayList<String>();
//    for (Gene g : module) names.add(g.getSymbol());
//    Collections.sort(names);
//    for (String name : names) System.out.println(name);
  }

  static void addNecessary(List<Holder> list, Holder... h)
  {
    List<Holder> pos = new ArrayList<Holder>();
    List<Holder> neg = new ArrayList<Holder>();

    for (Holder hld : h)
    {
      if (hld.cor > 0) pos.add(hld);
      else neg.add(hld);
    }

    Holder toAdd = null;
    for (Holder ho : pos)
    {
      if (toAdd == null) toAdd = ho;
      else if (ho.pval < toAdd.pval) toAdd = ho;
    }
    if (toAdd != null) list.add(toAdd);
    toAdd = null;
    for (Holder ho : neg)
    {
      if (toAdd == null) toAdd = ho;
      else if (ho.pval < toAdd.pval) toAdd = ho;
    }
    if (toAdd != null) list.add(toAdd);
  }

  static class Holder implements Comparable
  {
    String name;
    Gene mod;
    double cor;
    Double cmp;
    boolean[] pos;
    Gene[] module;
    int[] sign;
    Double pval;

    Holder(String name, Gene mod, Gene[] module, int[] sign, boolean [] pos)
    {
      this.mod = mod;
      this.module = module;
      this.sign = sign;
      this.pos = pos;

      if (name.length() < 4) name += "  ";
      this.name = name;

      this.cor = calcAverageCorrelation();
      cmp = Math.abs(cor);
      this.pval = Pearson.calcCorrSignificance(cor, ArrayUtils.countTrue(pos));
    }

    double calcAverageCorrelation()
    {
      double total = 0;
      int cnt = 0;
      int n = pos == null ? mod.value.length : ArrayUtils.countTrue(pos);

      for (int i = 0; i < module.length; i++)
      {
        double cor = pos == null ? Pearson.calcCorrelation(mod.value, module[i].value) :
          Pearson.calcCorrelation(mod.value, module[i].value, pos);

        double pv = Pearson.calcCorrSignificance(cor, n);

        if (pv < CORR_PV_THR)
        {
          total += sign == null ? cor : cor * sign[i];
          cnt++;

//        if (mod.getSymbol().equals("FOXA1") && pos == null)
//        {
//          int s = (int) Math.signum(cor * sign[i]);
//          System.out.println(gene[i].getSymbol() + "\t" + fmt.format(cor) + "\t" + s);
//        }
        }
      }
      return total / cnt;
    }

    public int compareTo(Object o)
    {
      int c = pval.compareTo(((Holder) o).pval);
      if (c != 0) return c;
      return ((Holder) o).cmp.compareTo(cmp);
    }

    @Override
    public String toString()
    {
      return name + "\t" + fmt.format(cor);// + "\t" + cnt;
    }
  }

  public static List<String> getModSymSet() throws Throwable
  {
//    Map<String, Set<String>> map = HPRDParser.readFor(new HashSet<String>(Arrays.asList("AR")));
//    List<String> list = new ArrayList<String>(map.get("AR"));

//    List<String> list = new ArrayList<String>(Triplet.getSymbolToGeneMap().keySet());

    List<String> list = new ArrayList<String>(TabDelimitedFileParser.getColumnSet(
      "resource/NuclearReceptors.txt", 0));

    Collections.sort(list);
    return list;
  }

  public static boolean[] filterHalf(Gene mod, boolean[] pos, boolean upperHalf)
  {
    boolean[] fil = new boolean[pos.length];
    double median = Summary.median(mod.value, pos);
    for (int i = 0; i < pos.length; i++)
    {
      fil[i] = pos[i] &&
        ((upperHalf && mod.value[i] > median) || (!upperHalf && mod.value[i] < median));
    }
    return fil;
  }

  public static void printCorrMatrix(List<Holder> corHold)
  {
    List<Gene> genes = new ArrayList<Gene>();
    for (Holder h : corHold) if (!genes.contains(h.mod)) genes.add(h.mod);

    System.out.print("    ");
    for (Holder h2 : corHold)
    {
      System.out.print("\t" + h2.name);
    }
    for (Gene g : genes)
    {
      String s = g.getSymbol();
      if (s.length() < 4) s += "  ";
      System.out.print("\n" + s);

      for (Holder h2 : corHold)
      {
        if (g == h2.mod) System.out.print("\t    ");
        else System.out.print("\t" +
          fmt.format(Pearson.calcCorrelation(g.value, h2.mod.value, h2.pos)));
      }
    }
    System.out.println();
  }

  public static void printModDependency(List<Holder> corHold)
  {
    List<Gene> genes = new ArrayList<Gene>();
    for (Holder h : corHold) if (!genes.contains(h.mod)) genes.add(h.mod);

    System.out.print("    ");
    for (Holder h2 : corHold)
    {
      System.out.print("\t" + h2.name);
    }
    for (Gene g : genes)
    {
      String s = g.getSymbol();
      if (s.length() < 4) s += "  ";
      System.out.print("\n" + s);

      for (Holder h2 : corHold)
      {
        if (g == h2.mod || !secondModDependsFirst(g, h2))
          System.out.print("\t    ");
        else System.out.print("\t----");
      }
    }
    System.out.println();
  }

  private static boolean secondModDependsFirst(Gene g1, Holder h2)
  {
    double cor = Pearson.calcCorrelation(g1.value, h2.mod.value, h2.pos);
    Holder h = new Holder("", g1, h2.module, h2.sign, h2.pos);

    double v = h.cor * cor;
    if (v * h2.cor < 0) return false;
    if (Math.abs(h2.cor) < Math.abs(v)) return true;

    double dif = Math.abs(h2.cor - v);
    double pv = Pearson.calcCorrSignificance(dif, ArrayUtils.countTrue(h2.pos));

//    double pv = Pearson.calcSignificanceOfCorrDifference(v, trueCnt, h2.cor, trueCnt) / 2;
    return pv > 0.05;
  }

  public static double[][] getCorrMatrix(Gene[] gene)
  {
    double[][] c = new double[gene.length][gene.length];
    for (int i = 0; i < c.length - 1; i++)
    {
      for (int j = i + 1; j < c.length; j++)
      {
        c[i][j] = Pearson.calcCorrelation(gene[i].value, gene[j].value);
        c[j][i] = c[i][j];
      }
    }
    return c;
  }

  public static void printSignDistr(int[] sign)
  {
    int pos = 0;
    int neg = 0;

    for (int i = 0; i < sign.length; i++)
    {
      if (sign[i] == 1) pos ++;
      else neg++;
    }
    System.out.println("pos = " + pos + "\tneg = " + neg);
  }

  public static void writeCorrGraph(Gene[] gene) throws IOException
  {
    double[][] c = getCorrMatrix(gene);
    BufferedWriter writer = new BufferedWriter(new FileWriter("graph.graphml"));
    GraphML.writeHeader(writer);

    Set<Gene> created = new HashSet<Gene>();

    for (int i = 0; i < c.length - 1; i++)
    {
      for (int j = i + 1; j < c.length; j++)
      {
        if (Math.abs(c[i][j]) > 0.7)
        {
          createNode(gene[i], writer, created);
          createNode(gene[j], writer, created);
          writer.write(GraphML.createEdgeData(gene[i].getSymbol(), gene[j].getSymbol(),
            GraphML.getColor(c[i][j]), true, false));
        }
      }
    }

    GraphML.writeFooter(writer);
    writer.close();
  }

  private static void createNode(Gene o, BufferedWriter writer, Set<Gene> created) throws IOException
  {
    if (!created.contains(o))
    {
      writer.write(GraphML.createNodeData(o.getSymbol(), o.getSymbol(), Color.WHITE, 0, true));
      created.add(o);
    }
  }

  public static List<Gene[]> getCorrelatedModules(Gene[] genes, int[] sign, boolean[] pos, Gene probe)
  {
    double[][] correlation = getCorrelationMatrix(genes, sign, pos);

    double[] x = null;
    if (probe != null)
    {
      x = new double[correlation.length];
      for (int i = 0; i < x.length; i++)
      {
        x[i] = Pearson.calcCorrelation(probe.value, genes[i].value, pos);
      }
    }

    List<List<Integer>> moduleIndexes = null;

    if (x == null)
    {
      moduleIndexes = ClusterSpectra.analyzeInANewWay(correlation, x);//getModules(correlation, bin, avgCor);
    }
    else
    {
      moduleIndexes = new ArrayList<List<Integer>>();
      moduleIndexes.add(ClusterSpectra.getModuleOfMostPositivelyCorrelating(correlation, x));
      moduleIndexes.add(ClusterSpectra.getModuleOfMostNegativelyCorrelating(correlation, x));
    }

//    printModuleOVerlapMatrix(moduleIndexes);

    List<Gene[]> modules = new ArrayList<Gene[]>();

    for (List<Integer> moduleIndex : moduleIndexes)
    {
      Gene[] module = new Gene[moduleIndex.size()];

      int i = 0;
      for (Integer ind : moduleIndex)
      {
        module[i++] = genes[ind];
      }

      assert i == moduleIndex.size();

      modules.add(module);
    }

    return modules;
  }

  public static void printModuleScores(Gene[] genes, int[] sign, boolean[] pos, Gene[] modulator)
  {
    double[][] correlation = getCorrelationMatrix(genes, sign, pos);

    double[][] x = new double[modulator.length][correlation.length];

    for (int i = 0; i < modulator.length; i++)
    {
      for (int j = 0; j < correlation.length; j++)
      {
        x[i][j] = Pearson.calcCorrelation(modulator[i].value, genes[j].value, pos);
      }
    }

    List<Holder3> list = new ArrayList<Holder3>();


    for (int i = 0; i < modulator.length; i++)
    {
      List<Integer> posit = ClusterSpectra.getModuleOfMostPositivelyCorrelating(correlation, x[i]);
      List<Integer> negat = ClusterSpectra.getModuleOfMostNegativelyCorrelating(correlation, x[i]);

      double positCor = getAvgCor(x[i], posit);
      double negatCor = getAvgCor(x[i], negat);

      list.add(new Holder3(modulator[i], posit, negat, positCor, negatCor));


      if (modulator[i].getSymbol().equals("VDR") && ArrayUtils.countTrue(pos) == 83)
      {
        boolean[][] w = ArrayUtils.getSlidingWindow(modulator[i].value, pos, ArrayUtils.countTrue(pos) / 2);
        Gene[] module = cropGenes(genes, negat);
        for (int j = 0; j < w.length; j++)
        {
          System.out.println(j + "\t" + getAverageCorrelation(module, sign, w[j]));
        }

//        ClusterSpectra.growModuleFromSeed(correlation, new double[][]{x[i]}, new String[]{modulator[i].getSymbol()}, ClusterSpectra.getTopIndexes(x[i], 3));
      }
    }
    Collections.sort(list);
    for (Holder3 h : list)
    {
      if (h.score > 4) System.out.println(h);
    }

//    List<Gene[]> modules = new ArrayList<Gene[]>();
//
//    for (List<Integer> moduleIndex : moduleIndexes)
//    {
//      Gene[] module = new Gene[moduleIndex.size()];
//
//      int i = 0;
//      for (Integer ind : moduleIndex)
//      {
//        module[i++] = genes[ind];
//      }
//
//      assert i == moduleIndex.size();
//
//      modules.add(module);
//    }
  }

  static Gene[] cropGenes(Gene[] genes, List<Integer> module)
  {
    Gene[] crop = new Gene[module.size()];
    int j = 0;
    for (Integer i : module)
    {
      crop[j++] = genes[i];
    }
    return crop;
  }

  static class Holder3 implements Comparable
  {
    Gene mod;
    List<Integer> posit;
    List<Integer> negat;
    double positCor;
    double negatCor;
    Double score;

    Holder3(Gene mod, List<Integer> posit, List<Integer> negat, double positCor, double negatCor)
    {
      this.mod = mod;
      this.posit = posit;
      this.negat = negat;
      this.positCor = positCor;
      this.negatCor = negatCor;
      score = Math.max(posit.size() * Math.abs(positCor), negat.size() * Math.abs(negatCor));
    }

    public int compareTo(Object o)
    {
      return ((Holder3) o).score.compareTo(score);
    }

    @Override
    public String toString()
    {
      return mod.getSymbol() + "\t" + posit.size() + "\t" + positCor +
          "\t" + negat.size() + "\t" + negatCor;
    }
  }

  private static double getAvgCor(double[] cor, List<Integer> module)
  {
    double mean = 0;
    for (Integer i : module)
    {
      mean += cor[i];
    }
    mean /= module.size();
    return mean;
  }

  private static double[][] getCorrelationMatrix(Gene[] genes, int[] sign, boolean[] pos)
  {
    double[][] correlation = new double[genes.length][genes.length];
    int expSize = pos == null ? genes[0].value.length : ArrayUtils.countTrue(pos);

    for (int i = 0; i < genes.length-1; i++)
    {
      for (int j = i + 1; j < genes.length; j++)
      {
        double[][] val = new double[2][];
        val[0] = genes[i].value;
        val[1] = genes[j].value;

        double cor = pos == null ? Pearson.calcCorrelation(val) :
          Pearson.calcCorrelation(val, pos);

        double pv = Pearson.calcCorrSignificance(cor, expSize);

        if (pv < CORR_PV_THR)
        {
          correlation[i][j] = cor * sign[i] * sign[j];
          correlation[j][i] = correlation[i][j];
        }
      }
    }
    return correlation;
  }

  private static void printModuleOVerlapMatrix(List<List<Integer>> modules)
  {
    System.out.println("Module overlaps");
    for (List module : modules)
    {
      System.out.print("\t" + module.size());
    }
    for (List module : modules)
    {
      System.out.print("\n" + module.size());

      for (List mod : modules)
      {
        System.out.print("\t" + (mod == module ? "" : SetUtils.countCommon(mod, module)));
      }
    }
    System.out.println();
  }

  private static void printModuleOVerlapMatrix(List<List<Gene>> modules, int tarNum)
  {
    Map<Gene, Integer> cnt = new HashMap<Gene, Integer>();
    TermCounter tc = new TermCounter();
    System.out.println("Module overlaps");
    for (List<Gene> module : modules)
    {
      System.out.print("\t" + module.size());

      for (Gene g : module)
      {
        if (!cnt.containsKey(g)) cnt.put(g, 1);
        else cnt.put(g, cnt.get(g) + 1);
        tc.addTerm(g.getSymbol());
      }
    }
    for (List module : modules)
    {
      System.out.print("\n" + module.size());

      int com = 0;
      int msize = 0;
      for (List mod : modules)
      {
        com = SetUtils.countCommon(mod, module);
        msize = mod.size();
        System.out.print("\t" + (mod == module ? "" : com));
      }
      System.out.print("\t" + (com / ((module.size() * msize) / (double) tarNum)));
    }
    System.out.println();
//    tc.print();
   
  }

  private static List<Integer>[] getFrequencyDistribution(Map<Integer, Integer> count, Gene[] gene, double[][] correlation)
  {
    int maxFreq = 0;
    for (Integer id : count.keySet())
    {
      Integer cnt = count.get(id);
      if (maxFreq < cnt) maxFreq = cnt;

    }

    int totalBins = gene.length / AVG_BINS_PER_GENE;
    List<Integer>[] bin = new List[totalBins];
    for (int i = 0; i < bin.length; i++) bin[i] = new ArrayList<Integer>();

    for (Integer id : count.keySet())
    {
      Integer cnt = count.get(id);
      int i = (int) Math.floor((cnt / (double) maxFreq) * totalBins);
      if (i == totalBins) i--;
      bin[i].add(id);
    }

    for (int i = 0; i < bin.length; i++)
    {
      System.out.println(i + "\t" + bin[i].size());
    }

    return bin;
  }

  private static void printCorrelationDistribution(double[][] correlation, Set<Integer> ids)
  {
    Histogram h = new Histogram(0.1);
    for (Integer id1 : ids)
    {
      for (Integer id2 : ids)
      {
        if (id1 >= id2) continue;

        h.count(correlation[id1][id2]);
      }
    }
    System.out.println("Distribution of correlations inside top scored module");
    h.print();
  }

  private static void printAvgCorrelationDistribution(double[][] correlation, Set<Integer> ids)
  {
    Histogram h = new Histogram(0.05);
    Map<Integer, Double> avg = new HashMap<Integer, Double>();
    for (Integer id1 : ids)
    {
      avg.put(id1, 0.);

      for (Integer id2 : ids)
      {
        if (id1.equals(id2)) continue;
        avg.put(id1, avg.get(id1) + correlation[id1][id2]);
      }
      avg.put(id1,  avg.get(id1) / (ids.size() - 1));
      h.count(avg.get(id1));
    }
    System.out.println("Distribution of average correlations inside module");
    h.print();
  }

  private static double getAverageExpression(Gene[] gene)
  {
    double exp = 0;
    for (Gene g : gene)
    {
      for (double v : g.value)
      {
        exp += Math.exp(v);
      }
    }
    exp /= gene.length * gene[0].value.length;
    return exp;
  }

  private static double getAverageCorrelation(Gene[] gene, int [] sign, boolean[] pos)
  {
    double cor = 0;
    for (int i = 0; i < gene.length-1; i++)
    {
      for (int j = i+1; j < gene.length; j++)
      {
        cor += Pearson.calcCorrelation(gene[i].value, gene[j].value, pos) * sign[i] * sign[j];
      }
    }
    cor /= (gene.length * (gene.length - 1)) / 2;
    return cor;
  }

  private static void printInterAndIntraCorrAverages(double[][] correlation,
    Set<Integer> ids1, Set<Integer> ids2)
  {
    double inter1 = 0;
    double inter2 = 0;
    double intra = 0;

    for (Integer i : ids1) for (Integer j : ids1) if (i < j) inter1 += correlation[i][j];
    for (Integer i : ids2) for (Integer j : ids2) if (i < j) inter2 += correlation[i][j];
    for (Integer i : ids1) for (Integer j : ids2) intra += correlation[i][j];

    inter1 /= ids1.size() * ids1.size() / 2;
    inter2 /= ids2.size() * ids2.size() / 2;
    intra /= ids1.size() * ids2.size();

    System.out.println("inter1 = " + inter1);
    System.out.println("inter2 = " + inter2);
    System.out.println("intra = " + intra);
  }

  private static int[] getSigns(Gene[] genes)
  {
    int[] signs = new int[genes.length];
    TabDelimitedFileParser p = new TabDelimitedFileParser("resource/factors/AR_andr_small.txt");
    Map<String, String> score = p.getOneToOneMap("Target", "Score");
    for (int i = 0; i < genes.length; i++)
    {
      signs[i] = score.get(genes[i].getSymbol()).startsWith("-") ? -1 : 1;
    }
    return signs;
  }

  private static List<Integer> getCluster(double[][] correlation, int ind1, int ind2,
    double scoreAdditionThr)
  {
    int size = correlation.length;
    List<Integer> list = new ArrayList<Integer>();
    list.add(ind1);
    list.add(ind2);

    while(true)
    {
      double maxIncrease = 0;
      int indexOfMax = -1;

      for (int i = 0; i < size; i++)
      {
        if (list.contains(i)) continue;

        double addition = 0;

        for (Integer ind : list)
        {
          addition += correlation[i][ind];
        }

        if (addition > maxIncrease)
        {
          maxIncrease = addition;
          indexOfMax = i;
        }
      }

      if (indexOfMax == -1) break;

      if (maxIncrease / list.size() > scoreAdditionThr)
      {
        list.add(indexOfMax);
      }
      else break;
    }
    return list;
  }

  public static double getAverageAbsoluteCor(double [][] c)
  {
    double sum = 0;
    int cnt = 0;
    for (int i = 0; i < c.length-1; i++)
    {
      for (int j = i+1; j < c.length; j++)
      {
        double x = Math.abs(c[i][j]);
        sum += x;
        cnt++;
      }
    }
    return sum / cnt;
  }

  //----------- Identifying groups --------------------------------------------------------------|

  // d for distribution
  private static List<List<Integer>> getModules(double[][] correlation, List<Integer>[] d,
    double minCorAdd)
  {
    List<List<Integer>> modules = new ArrayList<List<Integer>>();

    boolean[] p = getPeaks(d);
    for (int i = 0; i < p.length; i++)
    {
      // ignore the low scoring peaks
      if (i < p.length / 4D) continue;

      if (p[i])
      {
        List<Integer> module = getModuleSeed(d[i], correlation, minCorAdd);
        if (module.size() < 3) continue;

        int j = 1;

        while (i-j >= 0 || i+j < p.length)
        {
          if (i + j < p.length) enlargeModule(correlation, module, d[i + j], minCorAdd);
          if (i - j >= 0) enlargeModule(correlation, module, d[i - j], minCorAdd);
          j++;
        }
        if (module.size() >= MODULE_SIZE_THR) modules.add(module);
      }
    }

    return modules;
  }

  private static List<Integer> getModuleSeed(List<Integer> ids, double [][] correlation,
    double minCorAdd)
  {
    // Do not overwrite peak set
    ids = new ArrayList<Integer>(ids);

    while (true)
    {
      double minScore = Double.MAX_VALUE;
      Integer minID = -1;

      for (Integer i : ids)
      {
        double score = 0;

        for (Integer j : ids)
        {
          if (i.equals(j)) continue;
          score += correlation[i][j];
        }
        score /= ids.size() - 1;

        if (score < minScore)
        {
          minScore = score;
          minID = i;
        }
      }

      if (minScore < minCorAdd)
      {
        ids.remove(minID);
      }
      else break;
    }
    return ids;
  }

  private static void enlargeModule(double [][] correlation, List<Integer> module,
    List<Integer> cand, double minCorAdd)
  {
    // do not modify original candidates list
    cand = new ArrayList<Integer>(cand);

    while (!cand.isEmpty())
    {
      double maxScore = -Double.MAX_VALUE;
      Integer maxInd = -1;

      for (Integer i : cand)
      {
        double score = 0;
        for (Integer j : module)
        {
          assert !i.equals(j);
          score += correlation[i][j];
        }
        score /= module.size();

        if (score > maxScore)
        {
          maxScore = score;
          maxInd = i;
        }
      }

      if (maxScore > minCorAdd)
      {
        module.add(maxInd);
        cand.remove(maxInd);
      }
      else break;
    }
  }

  private static boolean[] getPeaks(List<Integer>[] distr)
  {
    boolean[] p = new boolean[distr.length];
    int[] s = new int[distr.length];
    int[] r = new int[distr.length];
    int[] l = new int[distr.length];

    for (int i = 0; i < p.length; i++) s[i] = distr[i].size();

    for (int i = 0; i < p.length; i++)
    {
      if (i < p.length - 1) r[i] = s[i] < s[i+1] ? -1 : s[i] > s[i+1] ? 1 : 0;
      else r[i] = s[i] > 0 ? 1 : 0;

      if (i > 0) l[i] = s[i] < s[i-1] ? -1 : s[i] > s[i-1] ? 1 : 0;
      else l[i] = s[i] > 0 ? 1 : 0;
    }

    boolean goingStraight = false;

    for (int i = 0; i < p.length; i++)
    {
      if(r[i] == 1 && l[i] == 1)
      {
        p[i] = true;
        goingStraight = false;
      }
      else if (r[i] == 0 && l[i] == 1)
      {
        goingStraight = true;
        p[i] = false;
      }
      else if (r[i] == 0 && l[i] == 0 && goingStraight)
      {
        p[i] = false;
      }
      else if (r[i] == 1 && l[i] == 0 && goingStraight)
      {
        p[i] = true;
        goingStraight = false;
      }
      else
      {
        p[i] = false;
        goingStraight = false;
      }
    }
    return p;
  }

  private static void mergeSimilarModules(List<List<Integer>> modules)
  {
    for (List<Integer> module1 : new ArrayList<List<Integer>>(modules))
    {
      for (List<Integer> module2 : new ArrayList<List<Integer>>(modules))
      {
        if (module1 == module2) continue;
        int com = SetUtils.countCommon(module1, module2);
        if (com / (double) module1.size() > .8 && com / (double) module2.size() > .8)
        {
          for (Integer id : module1)
          {
            if (!module2.contains(id)) module2.add(id);
          }
          modules.remove(module1);
        }
      }
    }
  }

  //---------------------------------------------------------------------------------------------|

  public static void printResistanceGeneEnrichment(Gene[] tar, Gene[] mod, int[] sign)
  {
    TabDelimitedFileParser p = new TabDelimitedFileParser("resource/factors/AR-resistance.txt");
    Set<String> resSet = p.getColumnSet(0);
    Map<Gene, Set<String>> corMap = new HashMap<Gene, Set<String>>();
    for (int i = 0; i < mod.length; i++)
    {
      for (int j = 0; j < tar.length; j++)
      {
        double cor = Pearson.calcCorrelation(mod[i].value, tar[j].value);
        double pv = Pearson.calcCorrSignificance(cor, mod[i].value.length);
        if (pv > CORR_PV_THR)
        {
          if (!corMap.containsKey(mod[i])) corMap.put(mod[i], new HashSet<String>());

          corMap.get(mod[i]).add(tar[j].getSymbol());
        }
      }
    }
    Histogram h = new Histogram(0.1);
    for (int i = 0; i < mod.length; i++)
    {
      int com = SetUtils.countCommon(resSet, corMap.get(mod[i]));
      double ratio = com / (double) corMap.get(mod[i]).size();
      System.out.println(mod[i].getSymbol() + "\t" + com + "\t" + corMap.get(mod[i]).size() +
        "\t" + ratio);
      h.count(ratio);
    }
    h.print();
  }

  public static void takeLogIfNeeded(Gene[] gene)
  {
    boolean needed = false;
    for (double v : gene[0].value)
    {
      if (v > 20)
      {
        needed = true;
        break;
      }
    }
    if (needed)
    {
      for (int i = 0; i < gene.length; i++)
      {
        for (int j = 0; j < gene[i].value.length; j++)
        {
          gene[i].value[j] = Math.log(gene[i].value[j]);
        }
      }
      System.out.println("Took log of expressions");
    }
  }

  public static List<Gene> getGRResponseModule(Gene[] tars)
  {
    TabDelimitedFileParser p = new TabDelimitedFileParser("resource/resistance_resp1.txt");
    Set<String> resp = p.getColumnSet(0);
    List<Gene> list = new ArrayList<Gene>();

    for (Gene tar : tars)
    {
      if (resp.contains(tar.getSymbol())) list.add(tar);
    }
    return list;
  }

  public static final Set<String> GR_TARG = new HashSet<String>(Arrays.asList((
    "SLC45A3\n" +
      "FKBP5\n" +
      "CBLL1\n" +
      "TMEM140\n" +
      "CXCR7\n" +
      "INSIG1\n" +
      "SLC35F2\n" +
      "STK39\n" +
      "PPAP2A\n" +
      "ACSL3\n" +
      "TSKU\n" +
      "CENPN\n" +
      "SGK1\n" +
      "KLK2\n" +
      "GPER\n" +
      "PAK1IP1\n" +
      "UBE2J1\n" +
      "PPFIBP2\n" +
      "ATAD2\n" +
      "PRKCH\n" +
      "FZD5\n" +
      "DBI\n" +
      "HOMER2\n" +
      "TMEM38B\n" +
      "MPHOSPH9\n" +
      "GRB10\n" +
      "HOXC13\n" +
      "TNFAIP8\n" +
      "KCNN2\n" +
      "LRRN1\n" +
      "ST7").split("\n")));
}
TOP

Related Classes of gem.ClusterAnalyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.