Package ru.autosome.macroape.cli.generalized

Source Code of ru.autosome.macroape.cli.generalized.CollectDistanceMatrix$PWMWithThreshold

package ru.autosome.macroape.cli.generalized;

import ru.autosome.ape.calculation.findThreshold.CanFindThreshold;
import ru.autosome.ape.calculation.findThreshold.FindThresholdAPE;
import ru.autosome.ape.model.exception.HashOverflowException;
import ru.autosome.commons.backgroundModel.GeneralizedBackgroundModel;
import ru.autosome.commons.model.BoundaryType;
import ru.autosome.commons.model.Discretizer;
import ru.autosome.commons.motifModel.*;
import ru.autosome.commons.motifModel.types.DataModel;
import ru.autosome.macroape.calculation.generalized.CompareModelsCountsGiven;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

public abstract class CollectDistanceMatrix<ModelType extends Discretable<ModelType> & ScoreDistribution<BackgroundType> & Alignable<ModelType> & Named & ScoringModel, BackgroundType extends GeneralizedBackgroundModel> {
  class PWMWithThreshold {
    final ModelType pwm;
    final double roughThreshold;
    final double roughCount;
    final double preciseThreshold;
    final double preciseCount;
    PWMWithThreshold(ModelType pwm,
                     double roughThreshold, double roughCount,
                     double preciseThreshold, double preciseCount) {
      this.pwm = pwm;
      this.roughThreshold = roughThreshold;
      this.roughCount = roughCount;
      this.preciseThreshold = preciseThreshold;
      this.preciseCount = preciseCount;
    }
  }

  abstract protected String DOC_background_option();
  abstract protected String DOC_run_string();
  protected String documentString() {
    return "Command-line format:\n" +
            DOC_run_string() + " <folder with PWMs> [options]\n" +
            "\n" +
            "Options:\n" +
            "  [--rough-discretization <discretization level>] or [-d]\n" +
            "  [--precise-discretization <discretization level>]\n" +
            "  [--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set\n" +
            "  [-p <P-value>]\n" +
            "  [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value\n" +
            "  [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.\n" +
            "  [--ppm] or [--pfm] - treat the input file as Position Frequency Matrix. PPM-to-PWM transformation to be done internally.\n" +
            "  [--effective-count <count>] - effective samples set size for PPM-to-PWM conversion (default: 100). \n" +
            "  [-b <background probabilities] " + DOC_background_option() + "\n" +
            "  [--transpose] - load motif from transposed matrix (nucleotides in lines).\n" +
            "  [--parallelize <num of threads> <thread number>] - run only one task per numOfThreads (those equal to thread number modulo numOfThreads)\n" +
            DOC_additional_options() +
            "\n" +
            "Examples:\n" +
            "  " + DOC_run_string() + " ./motifs/ -d 10\n";
  }

  protected String DOC_additional_options() {
    return "";
  }

  protected Discretizer roughDiscretizer, preciseDiscretizer;
  protected File pathToCollectionOfPWMs;
  protected BackgroundType background;
  protected DataModel dataModel;
  protected Integer maxHashSize, maxPairHashSize;
  protected double effectiveCount;
  protected BoundaryType pvalueBoundary;
  protected double pvalue;
  protected Double preciseRecalculationCutoff; // null means that no recalculation will be performed
  protected boolean transpose;

  protected int numOfThreads, numThread;
  protected List<ModelType> pwmCollection;

  protected void setup_from_arglist(List<String> argv) {
    extract_path_to_collection_of_pwms(argv);
    while (argv.size() > 0) {
      extract_option(argv);
    }
    pwmCollection = loadMotifCollection(pathToCollectionOfPWMs);
  }

  protected abstract List<ModelType> loadMotifCollection(File path_to_collection);

  protected abstract void initialize_default_background();

  protected void initialize_defaults() {
    initialize_default_background();
    roughDiscretizer = new Discretizer(1.0);
    preciseDiscretizer = new Discretizer(10.0);
    maxHashSize = 10000000;
    maxPairHashSize = 10000;
    dataModel = DataModel.PWM;
    effectiveCount = 100;
    pvalue = 0.0005;
    pvalueBoundary = BoundaryType.UPPER;
    preciseRecalculationCutoff = null;
    transpose = false;

    numOfThreads = 1;
    numThread = 0;

    pathToCollectionOfPWMs = null;
    pwmCollection = null;
  }

  protected void extract_option(List<String> argv) {
    String opt = argv.remove(0);
    if (opt.equals("-b")) {
      background = extract_background(argv.remove(0));
    } else if(opt.equals("-p")) {
      pvalue = Double.valueOf(argv.remove(0));
    } else if (opt.equals("--max-hash-size")) {
      maxHashSize = Integer.valueOf(argv.remove(0));
    } else if (opt.equals("--max-2d-hash-size")) {
      maxPairHashSize = Integer.valueOf(argv.remove(0));
    } else if (opt.equals("--rough-discretization") || opt.equals("-d")) {
      roughDiscretizer = Discretizer.fromString(argv.remove(0));
    } else if (opt.equals("--precise-discretization")) {
      preciseDiscretizer = Discretizer.fromString(argv.remove(0));
    } else if (opt.equals("--pcm")) {
      dataModel = DataModel.PCM;
    } else if (opt.equals("--ppm") || opt.equals("--pfm")) {
      dataModel = DataModel.PPM;
    } else if (opt.equals("--effective-count")) {
      effectiveCount = Double.valueOf(argv.remove(0));
    } else if (opt.equals("--boundary")) {
      pvalueBoundary = BoundaryType.valueOf(argv.remove(0).toUpperCase());
    } else if (opt.equals("--precise")) {
      preciseRecalculationCutoff = Double.valueOf(argv.remove(0));
    } else if (opt.equals("--parallelize")) {
      numOfThreads = Integer.valueOf(argv.remove(0));
      numThread = Integer.valueOf(argv.remove(0));
    } else if (opt.equals("--transpose")) {
      transpose = true;
    } else {
      if (failed_to_recognize_additional_options(opt, argv)) {
        throw new IllegalArgumentException("Unknown option '" + opt + "'");
      }
    }
  }

  protected boolean failed_to_recognize_additional_options(String opt, List<String> argv) {
    return true;
  }

  protected void extract_path_to_collection_of_pwms(List<String> argv) {
    try {
      pathToCollectionOfPWMs = new File(argv.remove(0));
    } catch (IndexOutOfBoundsException e) {
      throw new IllegalArgumentException("Specify PWM-collection folder", e);
    }
  }

  protected List<PWMWithThreshold> collectThreshold() throws HashOverflowException {
    List<PWMWithThreshold> result = new ArrayList<PWMWithThreshold>();
    for (ModelType pwm: pwmCollection) {
      CanFindThreshold roughThresholdCalculator = new FindThresholdAPE<ModelType, BackgroundType>(pwm, background, roughDiscretizer, maxHashSize);
      CanFindThreshold.ThresholdInfo roughThresholdInfo = roughThresholdCalculator.thresholdByPvalue(pvalue, pvalueBoundary);
      double roughThreshold = roughThresholdInfo.threshold;
      double roughCount = roughThresholdInfo.numberOfRecognizedWords(background, pwm.length());

      CanFindThreshold preciseThresholdCalculator = new FindThresholdAPE<ModelType, BackgroundType>(pwm, background, preciseDiscretizer, maxHashSize);
      CanFindThreshold.ThresholdInfo preciseThresholdInfo = preciseThresholdCalculator.thresholdByPvalue(pvalue, pvalueBoundary);
      double preciseThreshold = preciseThresholdInfo.threshold;
      double preciseCount = preciseThresholdInfo.numberOfRecognizedWords(background, pwm.length());

      result.add(new PWMWithThreshold(pwm,
                                      roughThreshold, roughCount,
                                      preciseThreshold, preciseCount));
    }
    return result;
  }

  protected double calculateDistance(PWMWithThreshold first, PWMWithThreshold second) throws HashOverflowException {
    CompareModelsCountsGiven calc;
    CompareModelsCountsGiven.SimilarityInfo info;
    calc = calculator(first.pwm, second.pwm);

    info = calc.jaccard(first.roughThreshold, second.roughThreshold,
                        first.roughCount, second.roughCount);
    if (preciseRecalculationCutoff != null && info.similarity() > preciseRecalculationCutoff) {
      calc = calculator(first.pwm, second.pwm);
      info = calc.jaccard(first.preciseThreshold, second.preciseThreshold,
                          first.preciseCount, second.preciseCount);
    }
    return info.distance();
  }

  public void process() throws HashOverflowException {
    int taskNum = 0;
    List<PWMWithThreshold> thresholds = collectThreshold();
    Collections.sort(thresholds, new Comparator<PWMWithThreshold>() {
      @Override
      public int compare(PWMWithThreshold o1, PWMWithThreshold o2) {
        return o1.pwm.getName().compareTo(o2.pwm.getName());
      }
    });

    System.out.print("Motif name"+ "\t");
    for(PWMWithThreshold second: thresholds) {
      System.out.print(second.pwm.getName() + "\t");
    }
    System.out.println();
    for(PWMWithThreshold first: thresholds) {
      System.out.print(first.pwm.getName() + "\t");
      for(PWMWithThreshold second: thresholds) {

        if (taskNum % numOfThreads == numThread % numOfThreads) {
          // so that numThread in range 0..(n-1) was equal to 1..n
          int cmp = first.pwm.getName().compareTo(second.pwm.getName());
          if (cmp == 0) {
            System.out.print("0.0\t");
          } else if (cmp < 0) {
            System.out.print("x\t");
          } else {
            double distance = calculateDistance(first, second);
            System.out.print(distance + "\t");
          }
        } else {
          System.out.print("x\t");
        }
        taskNum += 1;

      }
      System.out.println();
      System.err.print(".");
    }
  }

  abstract protected CompareModelsCountsGiven<ModelType, BackgroundType> calculator(ModelType firstModel, ModelType secondModel);
  abstract protected BackgroundType extract_background(String str);
}
TOP

Related Classes of ru.autosome.macroape.cli.generalized.CollectDistanceMatrix$PWMWithThreshold

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.