package gem;
import gem.parser.TabDelimitedFileParser;
import gem.util.*;
import java.io.*;
import java.util.*;
/**
* @author Ozgun Babur
*/
public class VarianceAnalyzer implements Constants
{
public static void main(String[] args) throws Throwable
{
// printSummaryFor("RUNX1", "SLC8A1", "HIPK2", "DNAH1", "STAT1", "AR", "CD44", "EGFR", "FOXP2");
// filterExpressionData("resource/expdata/expo");
test6();
}
// Distribution of variance
private static void test6() throws IOException
{
double range = 0.05;
Histogram h1 = new Histogram(range);
Histogram h2 = new Histogram(range);
Histogram h3 = new Histogram(range);
Histogram2D h2D1 = new Histogram2D(range);
Histogram2D h2D1sub = new Histogram2D(range);
Histogram2D h2D2 = new Histogram2D(range);
Histogram2D h2D2sub = new Histogram2D(range);
Set<Double> d1 = new HashSet<Double>();
Set<Double> d2 = new HashSet<Double>();
Set<Point> p1 = new HashSet<Point>();
Set<Point> p2 = new HashSet<Point>();
Set<Point> p3 = new HashSet<Point>();
// String dir = "resource/expdata/GSE1133-GPL96/";
// String dir = "resource/expdata/GSE13159/";
// String dir = "resource/expdata/GSE7307/";
String dir = "resource/expdata/expo_select/";
TabDelimitedFileParser parser = new TabDelimitedFileParser(dir + "platform.txt");
Map<String, String> id2sym = parser.getOneToOneMap("ID", "Gene Symbol");
parser = new TabDelimitedFileParser("resource/phospho-site/ModifCounts.txt");
Map<String, String> sym2modif = parser.getOneToOneMap("Gene", "ModifCnt");
String[] tisIndex = getTissueIndexes();
// Progress prg = new Progress(54675);
Progress prg = new Progress(19800);
BufferedReader reader = new BufferedReader(new FileReader(dir + "data.txt"));
String line = reader.readLine();
int k = 0;
for (line = reader.readLine(); line != null; line = reader.readLine())
{
k++;
prg.tick();
String id = line.substring(0, line.indexOf("\t"));
String sym = id2sym.get(id);
boolean hasGeneSymbol = sym != null && sym.trim().length() > 0 && !sym.contains("/");
int modif = (hasGeneSymbol && sym2modif.containsKey(sym)) ?
Integer.parseInt(sym2modif.get(sym)) : 0;
line = line.substring(line.indexOf("\t") + 1);
double[] vals = parseValues(line);
// for (int i = 0; i < vals.length; i++) vals[i] = Math.pow(Math.E, vals[i]);
Gene gene = new Gene(sym, id, vals.length);
gene.value = vals;
gene.rankAdjustStatus(1D/3);
// double spe1 = calcSPEByMeans(gene, tisIndex);
double spe2 = calcSPEByRatios(gene, tisIndex);
double dev1 = Summary.stdev(vals);
double mean1 = Summary.mean(vals);
double dev3 = dev1 / mean1;
dev1 = Math.log(dev1);
mean1 = Math.log(mean1);
// if (spe1 > 2 && spe2 > 0.5 && mean1 > 8) System.out.println("high: " + sym);
// if (spe1 < 0.15 && spe2 < 0.15 && mean1 > 7) System.out.println("low: " + sym);
for (int i = 0; i < vals.length; i++)
{
vals[i] = Math.log(vals[i]);
}
double dev2 = Summary.stdev(vals);
double mean2 = Summary.mean(vals);
// h2D1.count(dev2, dev3);
if (hasGeneSymbol)
{
// if (spe2 < 0.197)
// {
// h1.count(dev2);
// d1.add(dev2);
// }
// else
// {
// h2.count(dev2);
// d2.add(dev2);
// }
p3.add(new Point(dev1, dev2, modif > 0 ? 1. : 0.));
h2D1.count(mean1, dev1);
h2D2.count(mean1, dev2);
// if (dev3 > 1)
// if (spe2 > 0.4)
// if (mean1 > 6)
if (modif > 0)
// if (hasGeneSymbol)
{
h2D1sub.count(mean1, dev1);
h2D2sub.count(mean1, dev2);
// p2.add(new Point(dev1, dev2));
}
// else p1.add(new Point(dev1, dev2));
}
}
reader.close();
System.out.println("lines = " + k);
Kolmogorov.plotMovingD(p3, 3, 9, 1, 0.1);
// h2D1.plot(false);
// h2D2.plot(false);
Histogram2DPlot pl = new Histogram2DPlot(h2D1, h2D1sub);
pl.setVisible(true);
pl = new Histogram2DPlot(h2D2, h2D2sub);
pl.setVisible(true);
// Histogram.printAll(h1, h2, h3);
// h1.printTogether(h2);
// h1.printDensity();
// System.out.println("----");
// h2.printDensity();
// System.out.println("----");
// h3.printDensity();
// double d = Kolmogorov.calcDStat(d1, d2);
// System.out.println("d1.size() = " + d1.size());
// System.out.println("d2.size() = " + d2.size());
// System.out.println("d stat = " + d);
// System.out.println("p-val = " + Kolmogorov.calcPval(d, d1.size(), d2.size()));
}
private static String[] getTissueIndexes() throws IOException
{
String firstLine = FileUtil.getFirstLine("resource/expdata/expo/data.txt");
firstLine = firstLine.substring(firstLine.indexOf("\""));
String[] expNames = firstLine.split("\t");
for (int i = 0; i < expNames.length; i++)
{
expNames[i] = expNames[i].substring(1, expNames[i].length() - 1);
}
TabDelimitedFileParser parser = new TabDelimitedFileParser("resource/celltypes_expO.txt");
Map<String, String> exp2tiss = parser.getOneToOneMap("Sample", "Tissue");
for (String exp : new HashSet<String>(exp2tiss.keySet()))
{
// exp2tiss.put(exp, exp2tiss.get(exp).substring(0, exp2tiss.get(exp).indexOf(" ")).toLowerCase());
exp2tiss.put(exp, exp2tiss.get(exp).toLowerCase());
}
String[] index = new String[expNames.length];
for (int i = 0; i < index.length; i++)
{
index[i] = exp2tiss.get(expNames[i]);
}
return index;
}
private static double getTissueSpecificity(Gene gene, String[] tissName)
{
Map<String, int[]> count = new HashMap<String, int[]>();
Map<String, Set<Double>> values = new HashMap<String, Set<Double>>();
double globalMean = 0;
for (int i = 0; i < gene.status.length; i++)
{
globalMean += gene.value[i];
if (!count.containsKey(tissName[i])) count.put(tissName[i], new int[] {0, 0});
if (!values.containsKey(tissName[i])) values.put(tissName[i], new HashSet<Double>());
values.get(tissName[i]).add(gene.value[i]);
if (gene.status[i] != MARGINAL)
{
count.get(tissName[i])[gene.status[i] == PRESENT ? 1 : 0]++;
}
}
globalMean = globalMean / gene.value.length;
double maxMetric = -Double.MAX_VALUE;
Set<Double> metrics = new HashSet<Double>();
for (String tis : values.keySet())
{
Set<Double> vals = values.get(tis);
if (vals.size() < 20) continue;
int over = 0;
for (Double val : vals)
{
if (val > globalMean) over++;
}
double metric = over / (double) vals.size();
//-------
// int ca = count.get(tis)[0];
// int cp = count.get(tis)[1];
//
// double total = ca + cp;
// if (total < 20) continue;
//
// double mean = cp / total;
//-------
//-------
// Set<Double> vals = values.get(tis);
//
// if (vals.size() < 20) continue;
//
// double mean = 0;
// for (Double v : vals) mean += v;
// mean /= vals.size();
// mean = Math.log(mean);
//-------
metrics.add(metric);
if (metric > maxMetric) maxMetric = metric;
}
double dif = 0;
for (Double metric : metrics)
{
dif += maxMetric - metric;
}
return dif / metrics.size();
// return maxMean - minMean;
// int leftSurplus = 0;
// int rightSurplus = 0;
// double ent = 0;
// for (String tiss : count.keySet())
// {
// double p = count.get(tiss)[1] / (double) (count.get(tiss)[1] + count.get(tiss)[0]);
// if (p > 0.0000001 && p < 0.9999999)
// {
// ent += - p * Math.log(p);
// }
// }
// return ent;
// {
// if (count.get(tiss)[1] + count.get(tiss)[0] < 10) continue;
//
// int dif = count.get(tiss)[1] - count.get(tiss)[0];
// if (dif > 0) rightSurplus += dif;
// else leftSurplus -= dif;
// }
// return Math.log(leftSurplus * rightSurplus);
}
private static double calcSPEByEntropy(Gene gene, String[] tissName)
{
Map<String, int[]> count = new HashMap<String, int[]>();
for (int i = 0; i < gene.status.length; i++)
{
if (!count.containsKey(tissName[i])) count.put(tissName[i], new int[] {0, 0});
if (gene.status[i] != MARGINAL)
{
count.get(tissName[i])[gene.status[i] == PRESENT ? 1 : 0]++;
}
}
double ent = 0;
for (String tiss : count.keySet())
{
double p = count.get(tiss)[1] / (double) (count.get(tiss)[1] + count.get(tiss)[0]);
if (p > 0.0000001 && p < 0.9999999)
{
ent += - p * Math.log(p);
}
}
return ent;
}
static double calcSPEByMeans(Gene gene, String[] tissName)
{
Map<String, Set<Double>> values = getTissueValues(gene, tissName);
double maxMetric = -Double.MAX_VALUE;
Map<String, Double> metrics = new HashMap<String, Double>();
for (String tis : values.keySet())
{
Set<Double> vals = values.get(tis);
if (vals.size() < 20) continue;
double sum = 0;
for (Double val : vals)
{
sum += val;
}
double metric = Math.log(sum / vals.size());
metrics.put(tis, metric);
if (metric > maxMetric) maxMetric = metric;
}
// if (gene.geneid.equals("SLC3A1")) printTissueMetrics(gene, metrics);
double dif = 0;
for (Double metric : metrics.values())
{
dif += maxMetric - metric;
}
return dif / (metrics.size() - 1);
}
static double calcSPEByRatios(Gene gene, String[] tissName)
{
Map<String, Set<Double>> values = getTissueValues(gene, tissName);
double mid = Summary.median(gene.value);
double maxMetric = -Double.MAX_VALUE;
Map<String, Double> metrics = new HashMap<String, Double>();
for (String tis : values.keySet())
{
Set<Double> vals = values.get(tis);
if (vals.size() < 20) continue;
int over = 0;
for (Double val : vals)
{
if (val > mid) over++;
}
double metric = over / (double) vals.size();
metrics.put(tis, metric);
if (metric > maxMetric) maxMetric = metric;
}
// if (gene.geneid.equals("SLC3A1")) printTissueMetrics(gene, metrics);
double dif = 0;
for (Double metric : metrics.values())
{
dif += maxMetric - metric;
}
return dif / (metrics.size() - 1);
}
static private void printTissueMetrics(Gene gene, Map<String, Double> values)
{
System.out.println("\n" + gene.geneid);
for (String tis : values.keySet())
{
System.out.println(tis + " " + values.get(tis));
}
}
private static Map<String, Set<Double>> getTissueValues(Gene gene, String[] tissName)
{
Map<String, Set<Double>> values = new HashMap<String, Set<Double>>();
for (int i = 0; i < gene.status.length; i++)
{
if (!values.containsKey(tissName[i]))
values.put(tissName[i], new HashSet<Double>());
values.get(tissName[i]).add(gene.value[i]);
}
return values;
}
/**
* Leaves one row per gene symbol. Selects rows with max mean expression.
* @param dir
*/
private static void filterExpressionData(String dir) throws IOException
{
TabDelimitedFileParser parser = new TabDelimitedFileParser(dir + "/platform.txt");
Map<String, String> id2sym = parser.getOneToOneMap("ID", "Gene Symbol");
Map<String, String> sym2row = new HashMap<String, String>();
Map<String, Double> sym2mean = new HashMap<String, Double>();
Map<String, Integer> sym2cnt = new HashMap<String, Integer>();
Progress prg = new Progress(54675);
BufferedReader reader = new BufferedReader(new FileReader(dir + "/data.txt"));
String outDir = dir + "_select";
File file = new File(outDir);
file.mkdir();
BufferedWriter writer = new BufferedWriter(new FileWriter(outDir + "/data.txt"));
String line = reader.readLine();
writer.write(line);
for (line = reader.readLine(); line != null; line = reader.readLine())
{
prg.tick();
String id = line.substring(0, line.indexOf("\t"));
String sym = id2sym.get(id);
boolean hasGeneSymbol = sym != null && sym.trim().length() > 0 && !sym.contains("/");
if (hasGeneSymbol)
{
if (sym2cnt.containsKey(sym)) sym2cnt.put(sym, sym2cnt.get(sym) + 1);
else sym2cnt.put(sym, 1);
double[] val = parseValues(line.substring(line.indexOf("\t") + 1));
double mean = Summary.mean(val);
if (!sym2row.containsKey(sym) || sym2mean.get(sym) < mean)
{
sym2mean.put(sym, mean);
sym2row.put(sym, line);
}
}
}
reader.close();
for (String row : sym2row.values())
{
writer.write("\n" + row);
}
writer.close();
Histogram h = new Histogram(1);
for (String sym : sym2cnt.keySet())
{
int cnt = sym2cnt.get(sym);
h.count(cnt);
if (cnt > 7)
{
System.out.println(sym + "\t" + cnt);
}
}
h.print();
}
private static double[] parseValues(String line)
{
String[] token = line.split("\t");
double[] val = new double[token.length];
for (int i = 0; i < token.length; i++)
{
val[i] = Double.parseDouble(token[i]);
}
return val;
}
private static void printSummaryFor(String... sym) throws IOException
{
Map<String, List<double[]>> map = getValuesOf(sym);
for (String s : map.keySet())
{
System.out.println("\n" + s);
List<double[]> vals = map.get(s);
double[] x = new double[vals.size()];
double[] y = new double[vals.size()];
int i = 0;
for (double[] val : vals)
{
double mean = Math.log(Summary.mean(val));
double logdev = Math.log(Summary.stdev(val));
double devlog = Summary.stdev(Summary.log(val));
x[i] = mean;
y[i++] = logdev;
System.out.println(mean + "\t" + logdev + "\t" + devlog);
}
double[] r = LinearRegression.regress(x, y);
System.out.println("a = " + r[0]);
System.out.println("r2 = " + r[2]);
}
}
private static Map<String, List<double[]>> getValuesOf(String[] sym) throws IOException
{
Map<String, List<double[]>> map = new HashMap<String, List<double[]>>();
String dir = "resource/expdata/expo/";
TabDelimitedFileParser parser = new TabDelimitedFileParser(dir + "platform.txt");
Map<String, Set<String>> sym2id = parser.getOneToManyMap("Gene Symbol", "ID");
Map<String, String> id2sym = new HashMap<String, String>();
for (String s : sym)
{
if (!sym2id.containsKey(s)) continue;
for (String id : sym2id.get(s))
{
id2sym.put(id, s);
}
}
Set<String> ids = id2sym.keySet();
BufferedReader reader = new BufferedReader(new FileReader(dir + "/data.txt"));
for (String line = reader.readLine(); line != null; line = reader.readLine())
{
String rowID = line.substring(0, line.indexOf("\t"));
if (ids.contains(rowID))
{
String s = id2sym.get(rowID);
if (!map.containsKey(s)) map.put(s, new ArrayList<double[]>());
map.get(s).add(parseValues(line.substring(line.indexOf("\t") + 1)));
}
}
return map;
}
}