Package gem

Source Code of gem.GEM

package gem;

import gem.util.Kronometre;
import gem.util.Progress;

import java.util.*;

/**
* Main class to run the GEM implementation. Make sure that the constant <code>TRIPLET_FILE</code>
* points to the hypothesis file, where each row contains Entrez Gene IDs of triplets. The false
* discovery rate is controlled by the <code>FDR</code> constant. The constant <code<MINVAR</code>
* is for filtering genes with low variance in the expression dataset. Any triplet hypothesis
* containing a gene with variance under this threshold is filtered out.
*/
public class GEM implements Constants
{
  /**
   * Names of expression dataset files.
   */
  private static final String EXPFILE[] = {
//    "resource/exp_big_1.txt", "resource/exp_big_2.txt", "resource/exp_big_3.txt", "resource/exp_big_4.txt"};
//    "resource/experiments_expO_1.txt", "resource/experiments_expO_2.txt"};
    "resource/expop/expop_1.txt", "resource/expop/expop_2.txt", "resource/expop/expop_3.txt"};
//    "resource/exp_Treg_1.exp", "resource/exp_Treg_2.exp", "resource/exp_Treg_3.exp"};

  /**
   * Flag to randomize the order of experiments for a gene. This is used for a randomization test.
   */
  private static final boolean RND = false;

  /**
   * Flag to run the analysis only on some triplets, which are tagged as debug.
   */
  private static final boolean DEBUG_TRIP = false;


  private static final boolean KEEP_OR = false;

  public static final String FACNAME = "AR";
  /**
   * File for initial triplet hypotheses. This is a tab-delimited text file, each row containing
   * an hypothesis. Entrez Gene IDs of M, F, and T are separated by tabs.
   */
  private static String TRIPLET_FILE = "resource/factor-trips/" + FACNAME + "-select.txt";
//  private static String TRIPLET_FILE = "resource/AR-hprd-tredlit-trips.txt";
//  private static String TRIPLET_FILE = "resource/triplets.txt";
//  private static String TRIPLET_FILE = "resource/NR3C1-AR-trips.txt";
//  private static String TRIPLET_FILE = "resource/tartrips/triplets_tar_mod-mod_all.txt";
//  private static String TRIPLET_FILE = "resource/triplets_FOXP3.txt";

  // These sets of M, F, and T are used for debugging, for filtering triplets to the desired
  // M, F, or T.

  private static String[] MODULATOR = new String[]{};//"NR3C1"};
  private static String[] FACTOR = new String[]{};//"AR"};
  private static String[] TARGET = new String[]{};//"NOL3"};

  /**
   * Desired false discovery rate.
   */
  private static final double FDR = 0.05;

  /**
   * Variance threshold for genes. Any triplet that contains a gene with a lower variance will be
   * filtered out.
   */
  public static final double MINVAR = 10;
  public static final double MINVARLOG = 0;

  /**
   * Name of the result file.
   */
  private static String RESULT_FILE = "result/Result" +
    "_fdr" + FDR + "_var" + MINVAR + (RND ? "_rnd" : "") + "_" + FACNAME +
    (MODULATOR.length > 0 ? "_M_" + MODULATOR[0] : "") +
    (FACTOR.length > 0 ? "_F_" + FACTOR[0] : "") +
    (TARGET.length > 0 ? "_T_" + TARGET[0] : "") +
    "_expo_select.xls";

  /**
   * Method that runs the analysis.
   */
  public void run() throws Throwable
  {
    // Load experiments
    List<Triplet> trips = Triplet.readTrips(TRIPLET_FILE);

    // Continue with desired M, F, and T if a selection was given

    if (TARGET.length > 0) trips = Triplet.selectTargets(trips, Arrays.asList(TARGET));
    if (FACTOR.length > 0) trips = Triplet.selectFactors(trips, Arrays.asList(FACTOR));
    if (MODULATOR.length > 0) trips = Triplet.selectModulators(trips, Arrays.asList(MODULATOR));

    // Remove ETV1-AR interaction from triplets. This is a false positive entry in HPRD. We
    // noticed it while investigating our results. GEM reported ETV1 as a strong modulator of
    // AR, but we could not find any evidence in the literature. Then we investigated the
    // evidence of ETV1-AR interaction in the HPRD database, and realized that the related paper
    // in HPRD databse is actually not related to ETV1. Dr. Renu Goel from HPRD approved that
    // ETV1-AR interaction is an incorrect entry in HPRD database.

    Iterator<Triplet> iter = trips.iterator();
    while(iter.hasNext())
    {
      Triplet t = iter.next();
      if (t.modulator.equals("2119") && t.factor.equals("367"))
      {
        iter.remove();
      }
    }

    System.out.println("triplets initial size = " + trips.size());
    System.out.println("initial unique F = " + countUniqueF(trips));
    System.out.println("initial unique M-F = " + countUniqueMF(trips));
    System.out.println("initial unique F-T = " + countUniqueFT(trips));

//    // Gets Entrez Gene IDs of genes in triplets. This is used for avoiding unecessary reading
//    // of expression data.
//    Set<String> geneIDs = Triplet.getGeneIDs(trips);
//
//    System.out.println(geneIDs.size() + " genes present in triplets");
//
//    // Read expression dataset
//    Map<String, List<Gene>> geneMap = Gene.readGenes(geneIDs, MINVAR, EXPFILE);
//
////    System.out.print("Keeping only most varied ... ");
////    Gene.keepMostVaried(geneMap);
////    System.out.println("ok");
//
//    // Report number of genes and experiments read.
//
//    int expsize = geneMap.values().iterator().next().iterator().next().getExpSize();
//    System.out.println("Experiments have " + geneMap.size() + " genes and " +
//      expsize + " samples.");
//
//    // Continue with debug triplets if DEBUG_TRIP flag is on
//    if (DEBUG_TRIP) trips = Triplet.getDebug(trips);
//
//    System.out.println("Triplets size = " + trips.size());
//
//    // Randomize experiments for a randomization test if RND flag is on
//    if (RND) Gene.randomize(geneMap.values());
//
//    // Associate triplets to the experiment data read
//    trips = Gene.associateExpsToTrips(geneMap, trips);
//    geneMap = null;

    trips = ExpDataReader.associate(trips, "resource/expdata/expo", MINVAR, MINVARLOG);

    CellTypeMatcher.filterToGender(Triplet.collectGenes(trips), true);

    printTripStats(trips, "After filtering and multiplying");

    // If the above processing didn't leave any triplet, return
    if (trips.isEmpty()) return;

    // Assign discrete values to genes for each experiment, i.e., low, null, and high
//    boolean[] ignore = CellTypeMatcher.getNonProstateNonExpo();
//    CaseCounter.count(trips, 1D / 3, ignore);
    CaseCounter.count(trips, 1D / 3);

//    trips = Triplet.filterToStateExistence(trips);
//    Difference.assignGammaPval(trips);
//    double pv_thr = Triplet.getPvalGammaThreshold(trips, FDR);
//    System.out.println("First for fdr " + FDR + " = " + pv_thr);
//    trips = Triplet.filterToPvalGamma(trips, pv_thr);

    // If any M-F state of a triplet is not represented in the discretized data,
    // fileter out these triplets
    trips = Triplet.filterToStateExistence(trips);

    // Calculate gamma
    Difference.assignGammaPval(trips);

    // Find the gamma significance thresold for the desired FDR
    double pv_thr = Triplet.getPvalGammaThreshold(trips, FDR);

    System.out.println("Threshold gamma significance for fdr " + FDR + " = " + pv_thr);

    // Filter out triplets with insignificant gamma
    trips = Triplet.filterToPvalGamma(trips, pv_thr);

    // Assign modulation categories
    TripletClassifier.assignClass(trips, 0.05);

//    Set<Gene> absentMods = getAbsentMods(trips);
//    trips = Difference.assignBestCountsConsiderAbsent(trips, absentMods);
//    Tester.calcDependencyMatrix(trips, pv_thr);
//    Tester.calcIndependencyRatios(trips);

//-------------------
//    printTripStats(trips, "After gamma filtering");
//
//    // Calculate betaM
//    Difference.assignBetaMPval(trips);
//
//    // Find the threshold betaM pval for the desired FDR
//    double pvalBetaMThr = Triplet.getPvalBetaMThreshold(trips, FDR);
//
//    System.out.println("Threshold betaM significance for fdr " + FDR + " = " + pvalBetaMThr);
//
//    // Filter out triplets where alphaM / betaM >= 1
//
//    trips = Triplet.filterToPvalBetaM(trips, pvalBetaMThr, KEEP_OR);
//    printTripStats(trips, "After betaM filtering");
//
//    trips = Triplet.filterToLogicalAND(trips);
//    printTripStats(trips, "After logical-or filtering");

    // Check the monotonicity of the dependency function, assumed during discretization. Filter
    // out non-monotonic triplets.
//    trips = Triplet.filterToMonotonic(trips);
//    printTripStats(trips, "After monotonicity filtering");
//------------------------

    String[] pos = CellTypeMatcher.getExpoGenderFilteredTissueNamesArray(true);
    for (Triplet t : trips)
    {
      if (t.modulator.equals("207") && t.target.equals("7113"))
        CaseCounter.printTissuesInCounts(t, pos);
    }

    printTripStats(trips, "After everything");

    if (!trips.isEmpty())
    {
      //Record significant modulations to the result file
      Triplet.record(trips, RESULT_FILE);
    }
  }

  private static Set<Gene> getAbsentMods(List<Triplet> trips)
  {
    Set<String> names = new HashSet<String>(Arrays.asList("AHR", "CASP1", "CDK6", "EFCAB6", "FHL2", "FLNA", "GRIP1", "IFI16", "NR3C1", "PAK6", "RUNX1", "TCF4", "TGFB1I1"));
//    Set<String> names = new HashSet<String>(Arrays.asList("EFCAB6", "GRIP1", "PAK6", "RUNX1", "TGFB1I1"));

    Set<Gene> absent = new HashSet<Gene>();

    for (Triplet t : trips)
    {
      String sym = t.getMSym();

      if (names.contains(sym)) absent.add(t.M);
    }
    return absent;
  }

  /**
   * Prints some statistics about the list of triplet hypothesis to the console.
   *
   * @param trips
   * @param msg
   */
  private static void printTripStats(List<Triplet> trips, String msg)
  {
    Set<String> entrezRSet = new HashSet<String>();
    Set<String> gbRSet = new HashSet<String>();
    Set<String> entrezFSet = new HashSet<String>();
    Set<String> gbFSet = new HashSet<String>();
    Set<String> entrezTSet = new HashSet<String>();
    Set<String> gbTSet = new HashSet<String>();

    for (Triplet t : trips)
    {
      entrezRSet.add(t.M.geneid);
      gbRSet.add(t.M.getGenBank());
      entrezFSet.add(t.F.geneid);
      gbFSet.add(t.F.getGenBank());
      entrezTSet.add(t.T.geneid);
      gbTSet.add(t.T.getGenBank());
    }
    System.out.println("\n-----" + msg + "---");
    System.out.println("size of trips = " + trips.size());
    System.out.println("Entrez Gene Mods = " + entrezRSet.size());
    System.out.println("GenBank regs = " + gbRSet.size());
    System.out.println("Entrez Gene Factors = " + entrezFSet.size());
    System.out.println("GenBank Factors = " + gbFSet.size());
    System.out.println("Entrez Gene Targets = " + entrezTSet.size());
    System.out.println("GenBank Targets = " + gbTSet.size());
    System.out.println("Unique M-F = " + countUniqueMF(trips));
    System.out.println("Unique F-T = " + countUniqueFT(trips));
    System.out.println("Unique M-F-T = " + countUniqueMFT(trips));
    System.out.println("--------\n");
  }

  private static int countUniqueF(List<Triplet> trips)
  {
    Set<String> rfSet = new HashSet<String>();

    for (Triplet t : trips)
    {
      rfSet.add(t.factor);
    }
    return rfSet.size();
  }

  private static int countUniqueMF(List<Triplet> trips)
  {
    Set<String> rfSet = new HashSet<String>();

    for (Triplet t : trips)
    {
      rfSet.add(t.modulator + "-" + t.factor);
    }
    return rfSet.size();
  }

  private static int countUniqueFT(List<Triplet> trips)
  {
    Set<String> rfSet = new HashSet<String>();

    for (Triplet t : trips)
    {
      rfSet.add(t.factor + "-" + t.target);
    }
    return rfSet.size();
  }

  private static int countUniqueMFT(List<Triplet> trips)
  {
    Set<String> rftSet = new HashSet<String>();

    for (Triplet t : trips)
    {
      rftSet.add(t.modulator + "-" + t.factor + "-" + t.target);
    }
    return rftSet.size();
  }

  public static void main(String[] args) throws Throwable
  {
    Kronometre kron = new Kronometre();
    GEM m = new GEM();
    m.run();
    kron.stop();
    kron.print();
  }
}
TOP

Related Classes of gem.GEM

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.