package gem;
import gem.util.Kronometre;
import gem.util.Progress;
import java.util.*;
/**
* Main class to run the GEM implementation. Make sure that the constant <code>TRIPLET_FILE</code>
* points to the hypothesis file, where each row contains Entrez Gene IDs of triplets. The false
* discovery rate is controlled by the <code>FDR</code> constant. The constant <code<MINVAR</code>
* is for filtering genes with low variance in the expression dataset. Any triplet hypothesis
* containing a gene with variance under this threshold is filtered out.
*/
public class GEM implements Constants
{
/**
* Names of expression dataset files.
*/
private static final String EXPFILE[] = {
// "resource/exp_big_1.txt", "resource/exp_big_2.txt", "resource/exp_big_3.txt", "resource/exp_big_4.txt"};
// "resource/experiments_expO_1.txt", "resource/experiments_expO_2.txt"};
"resource/expop/expop_1.txt", "resource/expop/expop_2.txt", "resource/expop/expop_3.txt"};
// "resource/exp_Treg_1.exp", "resource/exp_Treg_2.exp", "resource/exp_Treg_3.exp"};
/**
* Flag to randomize the order of experiments for a gene. This is used for a randomization test.
*/
private static final boolean RND = false;
/**
* Flag to run the analysis only on some triplets, which are tagged as debug.
*/
private static final boolean DEBUG_TRIP = false;
private static final boolean KEEP_OR = false;
public static final String FACNAME = "AR";
/**
* File for initial triplet hypotheses. This is a tab-delimited text file, each row containing
* an hypothesis. Entrez Gene IDs of M, F, and T are separated by tabs.
*/
private static String TRIPLET_FILE = "resource/factor-trips/" + FACNAME + "-select.txt";
// private static String TRIPLET_FILE = "resource/AR-hprd-tredlit-trips.txt";
// private static String TRIPLET_FILE = "resource/triplets.txt";
// private static String TRIPLET_FILE = "resource/NR3C1-AR-trips.txt";
// private static String TRIPLET_FILE = "resource/tartrips/triplets_tar_mod-mod_all.txt";
// private static String TRIPLET_FILE = "resource/triplets_FOXP3.txt";
// These sets of M, F, and T are used for debugging, for filtering triplets to the desired
// M, F, or T.
private static String[] MODULATOR = new String[]{};//"NR3C1"};
private static String[] FACTOR = new String[]{};//"AR"};
private static String[] TARGET = new String[]{};//"NOL3"};
/**
* Desired false discovery rate.
*/
private static final double FDR = 0.05;
/**
* Variance threshold for genes. Any triplet that contains a gene with a lower variance will be
* filtered out.
*/
public static final double MINVAR = 10;
public static final double MINVARLOG = 0;
/**
* Name of the result file.
*/
private static String RESULT_FILE = "result/Result" +
"_fdr" + FDR + "_var" + MINVAR + (RND ? "_rnd" : "") + "_" + FACNAME +
(MODULATOR.length > 0 ? "_M_" + MODULATOR[0] : "") +
(FACTOR.length > 0 ? "_F_" + FACTOR[0] : "") +
(TARGET.length > 0 ? "_T_" + TARGET[0] : "") +
"_expo_select.xls";
/**
* Method that runs the analysis.
*/
public void run() throws Throwable
{
// Load experiments
List<Triplet> trips = Triplet.readTrips(TRIPLET_FILE);
// Continue with desired M, F, and T if a selection was given
if (TARGET.length > 0) trips = Triplet.selectTargets(trips, Arrays.asList(TARGET));
if (FACTOR.length > 0) trips = Triplet.selectFactors(trips, Arrays.asList(FACTOR));
if (MODULATOR.length > 0) trips = Triplet.selectModulators(trips, Arrays.asList(MODULATOR));
// Remove ETV1-AR interaction from triplets. This is a false positive entry in HPRD. We
// noticed it while investigating our results. GEM reported ETV1 as a strong modulator of
// AR, but we could not find any evidence in the literature. Then we investigated the
// evidence of ETV1-AR interaction in the HPRD database, and realized that the related paper
// in HPRD databse is actually not related to ETV1. Dr. Renu Goel from HPRD approved that
// ETV1-AR interaction is an incorrect entry in HPRD database.
Iterator<Triplet> iter = trips.iterator();
while(iter.hasNext())
{
Triplet t = iter.next();
if (t.modulator.equals("2119") && t.factor.equals("367"))
{
iter.remove();
}
}
System.out.println("triplets initial size = " + trips.size());
System.out.println("initial unique F = " + countUniqueF(trips));
System.out.println("initial unique M-F = " + countUniqueMF(trips));
System.out.println("initial unique F-T = " + countUniqueFT(trips));
// // Gets Entrez Gene IDs of genes in triplets. This is used for avoiding unecessary reading
// // of expression data.
// Set<String> geneIDs = Triplet.getGeneIDs(trips);
//
// System.out.println(geneIDs.size() + " genes present in triplets");
//
// // Read expression dataset
// Map<String, List<Gene>> geneMap = Gene.readGenes(geneIDs, MINVAR, EXPFILE);
//
//// System.out.print("Keeping only most varied ... ");
//// Gene.keepMostVaried(geneMap);
//// System.out.println("ok");
//
// // Report number of genes and experiments read.
//
// int expsize = geneMap.values().iterator().next().iterator().next().getExpSize();
// System.out.println("Experiments have " + geneMap.size() + " genes and " +
// expsize + " samples.");
//
// // Continue with debug triplets if DEBUG_TRIP flag is on
// if (DEBUG_TRIP) trips = Triplet.getDebug(trips);
//
// System.out.println("Triplets size = " + trips.size());
//
// // Randomize experiments for a randomization test if RND flag is on
// if (RND) Gene.randomize(geneMap.values());
//
// // Associate triplets to the experiment data read
// trips = Gene.associateExpsToTrips(geneMap, trips);
// geneMap = null;
trips = ExpDataReader.associate(trips, "resource/expdata/expo", MINVAR, MINVARLOG);
CellTypeMatcher.filterToGender(Triplet.collectGenes(trips), true);
printTripStats(trips, "After filtering and multiplying");
// If the above processing didn't leave any triplet, return
if (trips.isEmpty()) return;
// Assign discrete values to genes for each experiment, i.e., low, null, and high
// boolean[] ignore = CellTypeMatcher.getNonProstateNonExpo();
// CaseCounter.count(trips, 1D / 3, ignore);
CaseCounter.count(trips, 1D / 3);
// trips = Triplet.filterToStateExistence(trips);
// Difference.assignGammaPval(trips);
// double pv_thr = Triplet.getPvalGammaThreshold(trips, FDR);
// System.out.println("First for fdr " + FDR + " = " + pv_thr);
// trips = Triplet.filterToPvalGamma(trips, pv_thr);
// If any M-F state of a triplet is not represented in the discretized data,
// fileter out these triplets
trips = Triplet.filterToStateExistence(trips);
// Calculate gamma
Difference.assignGammaPval(trips);
// Find the gamma significance thresold for the desired FDR
double pv_thr = Triplet.getPvalGammaThreshold(trips, FDR);
System.out.println("Threshold gamma significance for fdr " + FDR + " = " + pv_thr);
// Filter out triplets with insignificant gamma
trips = Triplet.filterToPvalGamma(trips, pv_thr);
// Assign modulation categories
TripletClassifier.assignClass(trips, 0.05);
// Set<Gene> absentMods = getAbsentMods(trips);
// trips = Difference.assignBestCountsConsiderAbsent(trips, absentMods);
// Tester.calcDependencyMatrix(trips, pv_thr);
// Tester.calcIndependencyRatios(trips);
//-------------------
// printTripStats(trips, "After gamma filtering");
//
// // Calculate betaM
// Difference.assignBetaMPval(trips);
//
// // Find the threshold betaM pval for the desired FDR
// double pvalBetaMThr = Triplet.getPvalBetaMThreshold(trips, FDR);
//
// System.out.println("Threshold betaM significance for fdr " + FDR + " = " + pvalBetaMThr);
//
// // Filter out triplets where alphaM / betaM >= 1
//
// trips = Triplet.filterToPvalBetaM(trips, pvalBetaMThr, KEEP_OR);
// printTripStats(trips, "After betaM filtering");
//
// trips = Triplet.filterToLogicalAND(trips);
// printTripStats(trips, "After logical-or filtering");
// Check the monotonicity of the dependency function, assumed during discretization. Filter
// out non-monotonic triplets.
// trips = Triplet.filterToMonotonic(trips);
// printTripStats(trips, "After monotonicity filtering");
//------------------------
String[] pos = CellTypeMatcher.getExpoGenderFilteredTissueNamesArray(true);
for (Triplet t : trips)
{
if (t.modulator.equals("207") && t.target.equals("7113"))
CaseCounter.printTissuesInCounts(t, pos);
}
printTripStats(trips, "After everything");
if (!trips.isEmpty())
{
//Record significant modulations to the result file
Triplet.record(trips, RESULT_FILE);
}
}
private static Set<Gene> getAbsentMods(List<Triplet> trips)
{
Set<String> names = new HashSet<String>(Arrays.asList("AHR", "CASP1", "CDK6", "EFCAB6", "FHL2", "FLNA", "GRIP1", "IFI16", "NR3C1", "PAK6", "RUNX1", "TCF4", "TGFB1I1"));
// Set<String> names = new HashSet<String>(Arrays.asList("EFCAB6", "GRIP1", "PAK6", "RUNX1", "TGFB1I1"));
Set<Gene> absent = new HashSet<Gene>();
for (Triplet t : trips)
{
String sym = t.getMSym();
if (names.contains(sym)) absent.add(t.M);
}
return absent;
}
/**
* Prints some statistics about the list of triplet hypothesis to the console.
*
* @param trips
* @param msg
*/
private static void printTripStats(List<Triplet> trips, String msg)
{
Set<String> entrezRSet = new HashSet<String>();
Set<String> gbRSet = new HashSet<String>();
Set<String> entrezFSet = new HashSet<String>();
Set<String> gbFSet = new HashSet<String>();
Set<String> entrezTSet = new HashSet<String>();
Set<String> gbTSet = new HashSet<String>();
for (Triplet t : trips)
{
entrezRSet.add(t.M.geneid);
gbRSet.add(t.M.getGenBank());
entrezFSet.add(t.F.geneid);
gbFSet.add(t.F.getGenBank());
entrezTSet.add(t.T.geneid);
gbTSet.add(t.T.getGenBank());
}
System.out.println("\n-----" + msg + "---");
System.out.println("size of trips = " + trips.size());
System.out.println("Entrez Gene Mods = " + entrezRSet.size());
System.out.println("GenBank regs = " + gbRSet.size());
System.out.println("Entrez Gene Factors = " + entrezFSet.size());
System.out.println("GenBank Factors = " + gbFSet.size());
System.out.println("Entrez Gene Targets = " + entrezTSet.size());
System.out.println("GenBank Targets = " + gbTSet.size());
System.out.println("Unique M-F = " + countUniqueMF(trips));
System.out.println("Unique F-T = " + countUniqueFT(trips));
System.out.println("Unique M-F-T = " + countUniqueMFT(trips));
System.out.println("--------\n");
}
private static int countUniqueF(List<Triplet> trips)
{
Set<String> rfSet = new HashSet<String>();
for (Triplet t : trips)
{
rfSet.add(t.factor);
}
return rfSet.size();
}
private static int countUniqueMF(List<Triplet> trips)
{
Set<String> rfSet = new HashSet<String>();
for (Triplet t : trips)
{
rfSet.add(t.modulator + "-" + t.factor);
}
return rfSet.size();
}
private static int countUniqueFT(List<Triplet> trips)
{
Set<String> rfSet = new HashSet<String>();
for (Triplet t : trips)
{
rfSet.add(t.factor + "-" + t.target);
}
return rfSet.size();
}
private static int countUniqueMFT(List<Triplet> trips)
{
Set<String> rftSet = new HashSet<String>();
for (Triplet t : trips)
{
rftSet.add(t.modulator + "-" + t.factor + "-" + t.target);
}
return rftSet.size();
}
public static void main(String[] args) throws Throwable
{
Kronometre kron = new Kronometre();
GEM m = new GEM();
m.run();
kron.stop();
kron.print();
}
}