/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package jmotifx.motifx;
import database.AminoAcid;
import database.Database;
import database.Protein;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import jmotifx.preprocess.JMotifXBlockSequenceExtractor;
import jmotifx.JMotifXSequenceFileReader;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import jmotifx.preprocess.JMotifXAroundSitePeptideObjectsMaker;
import jmotifx.sequenceobjects.AroundSiteFPeptideObject;
/**
*
* @author paiyeta1
* databaseFilePath=./etc/databases/human.protein.aa
peptideSequenceFile=./etc/projects/jmotifx/inputs/cptacglycosites.seqs
backgroudSequenceFile=./etc/projects/jmotifx/inputs/dBGlycosites.seqs
backgroundType=input
peptideWindow=13
pValueCutOff=0.000001
minimumOccurrence=20
*
*/
public class MotifX {
private double sig_threshold = 0.000001;
private double significance = 0;
private int occurrence_threshold = 0;
private int seqOccurrence = 0;
//private ArrayList<AroundSiteFPeptideObject> glycObjs_f;
private ArrayList<AroundSiteFPeptideObject> nonRedundantAroundSitePeptideObjects;
private ArrayList<AroundSiteFPeptideObject> backgroundAroundSitePeptideObjects;
private LinkedList<MotifXElucidatedMotif> motifs;
//private Database dbase;
private int outer_itr_count = 0;
private int inner_itr_count = 0;
public MotifX(HashMap<String,String> configMap) throws FileNotFoundException, IOException{
/*
*
*/
String dbPath = configMap.get("databaseFilePath");
String outDir = configMap.get("outputDirectoryPath");
Database db = new Database(dbPath);
System.out.println(" Extracting input peptide sequences...");
// instantiate objects
JMotifXAroundSitePeptideObjectsMaker objsMaker = new JMotifXAroundSitePeptideObjectsMaker();
// data sequence....
MotifXPeptideSequenceExtractor sExtr = new MotifXPeptideSequenceExtractor();
ArrayList<String> inputBlockSequences = sExtr.extractInputPeptideSeqeunces(configMap, db);
nonRedundantAroundSitePeptideObjects = objsMaker.createSequencesAroundSiteFPeptideObjects(inputBlockSequences, configMap);
// background sequence....
// Two presupposition...
// (i) presuppose bg sequences are in block sequences
// (ii) presuppose bg sequences are non-redundant
MotifXBackgroundExtractor bGExtr = new MotifXBackgroundExtractor();
ArrayList<String> backgroundSequences = bGExtr.extractBGSequences(configMap, db);
backgroundAroundSitePeptideObjects = objsMaker.createSequencesAroundSiteFPeptideObjects(backgroundSequences, configMap);
motifs = new LinkedList<MotifXElucidatedMotif>();
ArrayList<MotifXMostSigResiduePosition> mSigRPs = new ArrayList<MotifXMostSigResiduePosition>();
seqOccurrence = nonRedundantAroundSitePeptideObjects.size();
System.out.println("\t\tBegining iterative approach to elucidate motif...");
while((significance < sig_threshold) && (seqOccurrence > occurrence_threshold)){
outer_itr_count++;
System.out.println("\t\t In Loop " + outer_itr_count + ":");
//4. Make degenerate residues - non-redundant identified data (Optional)
//5. Make degenerate residues - background data (Optional)
ArrayList<AroundSiteFPeptideObject> iterativeNonRedundantAroundSitePeptideObjects =
nonRedundantAroundSitePeptideObjects;
ArrayList<AroundSiteFPeptideObject> iterativeBackgroundAroundSitePeptideObjects =
backgroundAroundSitePeptideObjects;
printFGlycObjSequences(outDir, "outputs_outerLoop_", outer_itr_count,
iterativeNonRedundantAroundSitePeptideObjects, "identified");
printFGlycObjSequences(outDir, "outputs_outerLoop_", outer_itr_count,
iterativeBackgroundAroundSitePeptideObjects,"background");
occurrence_threshold = iterativeNonRedundantAroundSitePeptideObjects.size()/20; //more than 5% of starting data;
seqOccurrence = iterativeNonRedundantAroundSitePeptideObjects.size(); //starting occurrence value
//print Loop's properties to an appropriately named XML file
String outLoopDir = outDir + File.separator + "outputs_outerLoop_" + outer_itr_count;
while(seqOccurrence > occurrence_threshold){
inner_itr_count++;
System.out.println("\t\t In Loop " + outer_itr_count + "." + inner_itr_count + ":");
String inLoopDir = getIterationResultDirectory2(outLoopDir,outer_itr_count,inner_itr_count);
String loopID = String.valueOf(outer_itr_count) + "." + String.valueOf(inner_itr_count);
//6. Create position-weight-matrix ((+/-degenerate) identified data)
MotifXPositionWeightMatrix idPepPWM =
new MotifXPositionWeightMatrix(iterativeNonRedundantAroundSitePeptideObjects);
System.out.println("\t\t Printing peptide's position weight matrix");
printPWMatrix(idPepPWM.getMatrix(),idPepPWM.getPositions(),
idPepPWM.getAminoAcidSymbols(), inLoopDir + File.separator + "id.pwm.matrix");
//7. Create position-weight-matrix ((+/- degenerate) background data)
MotifXPositionWeightMatrix bGPepPWM =
new MotifXPositionWeightMatrix(iterativeBackgroundAroundSitePeptideObjects);
System.out.println("\t\t Printing background's weight matrix");
printPWMatrix(bGPepPWM.getMatrix(),bGPepPWM.getPositions(),
bGPepPWM.getAminoAcidSymbols(), inLoopDir + File.separator + "bg.pwm.matrix");
//8. Create binomial-probability-matrix (non-redundant (+/- degerate) identified PWM, (+/- degenerate) BG PWM)
//i.e the probability of observing s or more occurrences of residue x at position j (from
//identified matrix) given the background probability P for resiidue x at position j
System.out.println("\t\t Call to BinomialProbMatrix object");
MotifXBinomialProbabilityMatrix bPMatrix =
new MotifXBinomialProbabilityMatrix(idPepPWM,bGPepPWM);
System.out.println("\t\t Printing binomial distribution weight matrix");
printBinomialDistrMatrix(bPMatrix.getMatrix(),bPMatrix.getPositions(),
bPMatrix.getAminoAcidSymbols(), inLoopDir + File.separator + "id.binomdist.matrix");
//9. Calculate significance
//10. Identify the most statistically significant residue/position pair that
// satisfies/meet predefined binomial probability and minimum occurrence thresholds
System.out.println("\t\t Printing binomial distribution weight matrix");
MotifXMostSigResiduePosition mSigRP =
bPMatrix.getMostSignificant();
seqOccurrence = mSigRP.getOccurrence(iterativeNonRedundantAroundSitePeptideObjects);
significance = mSigRP.getProbability();
mSigRPs.add(mSigRP);
System.out.println("\t\t Printing Inner Loop " + loopID + " properties...");
String XMLoutput = printInnerLoopPropertiesXML(inLoopDir,loopID,mSigRP, significance, seqOccurrence,
iterativeNonRedundantAroundSitePeptideObjects,iterativeBackgroundAroundSitePeptideObjects);
System.out.println("\t\t Inner Loop " + loopID + " properties printed to " + XMLoutput);
if((seqOccurrence > occurrence_threshold)){
//11. Reduce datasets (identified data, background data, most significant residue/position)
//retain only those containing the most significant residue/position
MotifXReducedDatasets redData =
reduceDatasets1(iterativeNonRedundantAroundSitePeptideObjects,
iterativeBackgroundAroundSitePeptideObjects,mSigRP);
iterativeNonRedundantAroundSitePeptideObjects = redData.idGlyc;
iterativeBackgroundAroundSitePeptideObjects = redData.bGGlyc;
}
}
//12. repeat steps 6 to 10 until no more statistical significant pair that meet the occurrence threshold
//NB: the truncating step for this loop is failure to meet the occurrence threshold.
//13. Tally Found residue/position pair to identify motif.
//tallyRPToDefineMotif(mSigRPs);
//Motif motif = new MotifXElucidatedMotif(mSigRPs);
MotifXElucidatedMotif motif = new MotifXElucidatedMotif(mSigRPs,nonRedundantAroundSitePeptideObjects, backgroundAroundSitePeptideObjects);
System.out.println("\t\tFound " + motif.getMotifSequence() + " motif in outer_" + outer_itr_count + " iteration");
motifs.add(motif);
mSigRPs = new ArrayList<MotifXMostSigResiduePosition>();
//14. Reduce Main dataset by removing sequences that match the motif identified
// in the previous motif Building step.
printOuterLoopPropertiesXML(outDir,"outputs_outerLoop_", outer_itr_count,
motifs, nonRedundantAroundSitePeptideObjects, backgroundAroundSitePeptideObjects);
MotifXReducedDatasets redData2 = reduceDatasets2(nonRedundantAroundSitePeptideObjects,backgroundAroundSitePeptideObjects,motif);
nonRedundantAroundSitePeptideObjects = redData2.idGlyc;
backgroundAroundSitePeptideObjects = redData2.bGGlyc;
//reset seqOccurrence and/or significance threshold; inner_itr_count
seqOccurrence = nonRedundantAroundSitePeptideObjects.size();
inner_itr_count = 0;
}
//15. Repeat steps 6 - 13 until no more statiscally significant step is found.
/*
* As a way to summarize the entire process...
* print the observed motifs and other summary values to a table
*
*/
System.out.println("Returning...");
}
private MotifXReducedDatasets reduceDatasets1(ArrayList<AroundSiteFPeptideObject> glycObjs_nr, ArrayList<AroundSiteFPeptideObject> bGGlycObj, MotifXMostSigResiduePosition mSigRP) {
//throw new UnsupportedOperationException("Not yet implemented");
ArrayList<AroundSiteFPeptideObject> idGlyc_tmp = new ArrayList<AroundSiteFPeptideObject>();
ArrayList<AroundSiteFPeptideObject> bGGlyc_tmp = new ArrayList<AroundSiteFPeptideObject>();
Iterator<AroundSiteFPeptideObject> itr1 = glycObjs_nr.iterator();
while(itr1.hasNext()){
AroundSiteFPeptideObject glycObj = itr1.next();
if(mSigRP.inFGlycObjAaSeqArray(mSigRP.getResidueSymbol(), mSigRP.getResiduePosition(), glycObj)){
idGlyc_tmp.add(glycObj);
}
}
Iterator<AroundSiteFPeptideObject> itr2 = bGGlycObj.iterator();
while(itr2.hasNext()){
AroundSiteFPeptideObject bglycObj = itr2.next();
if(mSigRP.inFGlycObjAaSeqArray(mSigRP.getResidueSymbol(), mSigRP.getResiduePosition(), bglycObj)){
bGGlyc_tmp.add(bglycObj);
}
}
//glycObjs_nr = idGlyc_tmp;
//bGGlycObj = bGGlyc_tmp;
return(new MotifXReducedDatasets(idGlyc_tmp,bGGlyc_tmp));
}
/*
* contrary to reduceDatasets1 which selects sequences objects that have motif, reduceDataset2 selects sequence
* objects that do not have motif.
*/
private MotifXReducedDatasets reduceDatasets2(ArrayList<AroundSiteFPeptideObject> glycObjs_nr, ArrayList<AroundSiteFPeptideObject> bGGlycObj, MotifXElucidatedMotif motif) {
//throw new UnsupportedOperationException("Not yet implemented");
ArrayList<AroundSiteFPeptideObject> idGlyc_tmp = new ArrayList<AroundSiteFPeptideObject>();
ArrayList<AroundSiteFPeptideObject> bGGlyc_tmp = new ArrayList<AroundSiteFPeptideObject>();
Iterator<AroundSiteFPeptideObject> itr1 = glycObjs_nr.iterator();
while(itr1.hasNext()){
AroundSiteFPeptideObject glycObj = itr1.next();
//String motif_seq = motif.motif_seq;
String glycOb_seq = glycObj.getSequence();
if(!motif.matches(glycOb_seq)){
idGlyc_tmp.add(glycObj);
}
}
Iterator<AroundSiteFPeptideObject> itr2 = bGGlycObj.iterator();
while(itr2.hasNext()){
AroundSiteFPeptideObject bglycObj = itr2.next();
String bglycOb_seq = bglycObj.getSequence();
if(!motif.matches(bglycOb_seq)){
bGGlyc_tmp.add(bglycObj);
}
}
//glycObjs_nr = idGlyc_tmp;
//bGGlycObj = bGGlyc_tmp;
return(new MotifXReducedDatasets(idGlyc_tmp,bGGlyc_tmp));
}
public LinkedList<MotifXElucidatedMotif> getMotifsElucidated(){
return motifs;
}
private String getIterationResultDirectory(String outDir, String loop, int itr_count){
String itrDir;
File outD = new File(outDir);
if((!outD.exists()) || (!outD.isDirectory())){
outD.mkdir();
}
//make Loop's directory
itrDir = outDir + File.separator + loop + itr_count;
File loopDFile = new File(itrDir);
if(!loopDFile.exists()){
loopDFile.mkdir();
}
return itrDir;
}
private void printFGlycObjSequences(String outDir,
String loop, int itr_count,
ArrayList<AroundSiteFPeptideObject> fglycObjs, String type) {
//throw new UnsupportedOperationException("Not yet implemented");
//make output dir if it doesn't exist
String loopDir = getIterationResultDirectory(outDir, loop, itr_count);
try {
PrintWriter writer = new PrintWriter(loopDir + File.separator + type + ".seqs");
Iterator<AroundSiteFPeptideObject> itr = fglycObjs.iterator();
while(itr.hasNext()){
AroundSiteFPeptideObject fglycObj = itr.next();
writer.println(fglycObj.getSequence());
}
writer.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(MotifX.class.getName()).log(Level.SEVERE, null, ex);
}
}
private void printOuterLoopPropertiesXML(String outDir, String outerLoop,
int outer_itr_count, LinkedList<MotifXElucidatedMotif> motifs1,
ArrayList<AroundSiteFPeptideObject> glycObj,
ArrayList<AroundSiteFPeptideObject> bGGlycObj) {
String loopDir = getIterationResultDirectory(outDir, outerLoop, outer_itr_count);
try {
//throw new UnsupportedOperationException("Not yet implemented");
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
Document doc = builder.newDocument();
//create xml document elements
Element iteration_properties = doc.createElement("itr_properties");
Element loop = doc.createElement("loop");
loop.appendChild(doc.createTextNode("outer"));
Element iteration = doc.createElement("iteration");
iteration.appendChild(doc.createTextNode(String.valueOf(outer_itr_count)));
Element identified = doc.createElement("identified");
Attr size1 = doc.createAttribute("size");
size1.setValue(String.valueOf(glycObj.size()));
identified.setAttributeNode(size1);
Element background = doc.createElement("background");
Attr size2 = doc.createAttribute("size");
size2.setValue(String.valueOf(bGGlycObj.size()));
background.setAttributeNode(size2);
Element motifs_found = doc.createElement("motifs");
Attr number = doc.createAttribute("number");
number.setValue(String.valueOf(motifs1.size()));
motifs_found.setAttributeNode(number);
if(motifs1.size() > 0){
Iterator<MotifXElucidatedMotif> itr = motifs1.iterator();
while(itr.hasNext()){
MotifXElucidatedMotif mot = itr.next();
Element mot_element= doc.createElement("motif");
Element mot_rep = doc.createElement("representation");
mot_rep.appendChild(doc.createTextNode(String.valueOf(mot.getMotifSequence())));
mot_element.appendChild(mot_rep);
Element mot_score = doc.createElement("score");
mot_score.appendChild(doc.createTextNode(String.valueOf(mot.getMotifScore())));
mot_element.appendChild(mot_score);
//Attr mot_score = doc.createAttribute("score");
//mot_score.setValue(String.valueOf(mot.getMotifScore()));
//mot_element.setAttributeNode(mot_score);
//mot_element.appendChild(doc.createTextNode(mot.getMotifSequence()));
Element mot_occurrence = doc.createElement("occurrence");
Attr attr1 = doc.createAttribute("in_identified");
attr1.setValue(String.valueOf(mot.getMatches(glycObj).size()));
mot_occurrence.setAttributeNode(attr1);
Attr attr2 = doc.createAttribute("in_background");
attr2.setValue(String.valueOf(mot.getMatches(bGGlycObj).size()));
mot_occurrence.setAttributeNode(attr2);
mot_element.appendChild(mot_occurrence);
motifs_found.appendChild(mot_element);
}
}
iteration_properties.appendChild(loop);
iteration_properties.appendChild(iteration);
iteration_properties.appendChild(identified);
iteration_properties.appendChild(background);
iteration_properties.appendChild(motifs_found);
doc.appendChild(iteration_properties);
//printxml
TransformerFactory transFactory = TransformerFactory.newInstance();
Transformer transformer = transFactory.newTransformer();
DOMSource source = new DOMSource(doc);
StreamResult result = new StreamResult(new File(loopDir + File.separator +
"outerLoop_" + outer_itr_count + ".properties.xml"));
transformer.transform(source, result);
} catch (TransformerException ex) {
Logger.getLogger(MotifX.class.getName()).log(Level.SEVERE, null, ex);
} catch (ParserConfigurationException ex) {
Logger.getLogger(MotifX.class.getName()).log(Level.SEVERE, null, ex);
}
//return loopDir;
}
private String getIterationResultDirectory2(String parent, int outer_itr_count, int inner_itr_count) {
//throw new UnsupportedOperationException("Not yet implemented");
String itrDir;
File outD = new File(parent);
if((!outD.exists()) || (!outD.isDirectory())){
outD.mkdir();
}
//make Loop's directory
itrDir = outD + File.separator + "outputs_innerLoop_" + outer_itr_count + "." + inner_itr_count;
File loopDFile = new File(itrDir);
if(!loopDFile.exists()){
loopDFile.mkdir();
}
return itrDir;
}
private void printPWMatrix(int matrix[][], int[] id, char[] aa, String out){
try {
PrintWriter writer = new PrintWriter(out);
//print header
for(int i = 0; i < id.length; i++){
if(i < id.length-1){
writer.print(id[i] + "\t");
}
if(i==id.length-1){
writer.println(id[i]);
}
}
//print rows
for(int j = 0; j < aa.length; j++){
writer.print(aa[j] + "\t");
for(int k = 0; k < id.length; k++){
if(k < id.length-1){
writer.print(matrix[j][k] + "\t");
}
if(k==id.length-1){
writer.println(matrix[j][k]);
}
}
}
writer.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(MotifXPositionWeightMatrix.class.getName()).log(Level.SEVERE, null, ex);
}
}
private void printBinomialDistrMatrix(double matrix[][], int[] id, char[] aa, String out){
try {
PrintWriter writer = new PrintWriter(out);
//print header
for(int i = 0; i < id.length; i++){
if(i < id.length-1){
writer.print(id[i] + "\t");
}
if(i==id.length-1){
writer.println(id[i]);
}
}
//print rows
for(int j = 0; j < aa.length; j++){
writer.print(aa[j] + "\t");
for(int k = 0; k < id.length; k++){
if(k < id.length-1){
writer.print(matrix[j][k] + "\t");
}
if(k==id.length-1){
writer.println(matrix[j][k]);
}
}
}
writer.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(MotifXPositionWeightMatrix.class.getName()).log(Level.SEVERE, null, ex);
}
}
private String printInnerLoopPropertiesXML(String inLoopDir, String loopID,
MotifXMostSigResiduePosition mSigRP, double significance,
int seqOccurrence, ArrayList<AroundSiteFPeptideObject> glycObj,
ArrayList<AroundSiteFPeptideObject> fGlycObj) {
String output = inLoopDir + File.separator + "innerLoop_" + loopID + ".properties.xml";
try {
//throw new UnsupportedOperationException("Not yet implemented");
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = factory.newDocumentBuilder();
Document doc = docBuilder.newDocument();
Element root = doc.createElement("inner_itr_properties");
Element loop = doc.createElement("loop");
loop.appendChild(doc.createTextNode(loopID));
Element msRP = doc.createElement("most_significant");
Attr attr1 = doc.createAttribute("residue");
Attr attr2 = doc.createAttribute("position");
attr1.setValue(String.valueOf(mSigRP.getResidueSymbol()));
attr2.setValue(String.valueOf(mSigRP.getResiduePosition()));
msRP.setAttributeNode(attr1);
msRP.setAttributeNode(attr2);
Element sig = doc.createElement("significance");
Attr sig_attr = doc.createAttribute("value");
sig_attr.setValue(String.valueOf(significance));
sig.appendChild(doc.createTextNode(String.valueOf(significance)));
msRP.appendChild(sig);
Element id = doc.createElement("identified");
Attr id_attr1 = doc.createAttribute("size");
Attr id_attr2 = doc.createAttribute("occurrence");
id_attr1.setValue(String.valueOf(glycObj.size()));
id_attr2.setValue(String.valueOf(seqOccurrence));
id.setAttributeNode(id_attr1);
id.setAttributeNode(id_attr2);
msRP.appendChild(id);
Element bg = doc.createElement("background");
Attr bg_attr1 = doc.createAttribute("size");
Attr bg_attr2 = doc.createAttribute("occurrence");
bg_attr1.setValue(String.valueOf(fGlycObj.size()));
bg_attr2.setValue(String.valueOf(mSigRP.getOccurrence(fGlycObj)));
bg.setAttributeNode(bg_attr1);
bg.setAttributeNode(bg_attr2);
msRP.appendChild(bg);
root.appendChild(loop);
root.appendChild(msRP);
doc.appendChild(root);
TransformerFactory tFactory = TransformerFactory.newInstance();
Transformer transformer = tFactory.newTransformer();
DOMSource source = new DOMSource(doc);
StreamResult result = new StreamResult(new File(output));
transformer.transform(source, result);
} catch (TransformerException ex) {
Logger.getLogger(MotifX.class.getName()).log(Level.SEVERE, null, ex);
} catch (ParserConfigurationException ex) {
Logger.getLogger(MotifX.class.getName()).log(Level.SEVERE, null, ex);
}
return output;
}
public void printMotifElucidationSummary(LinkedList<MotifXElucidatedMotif> motifs, String output) throws FileNotFoundException {
//throw new UnsupportedOperationException("Not yet implemented");
System.out.println("Printing Elucidated motifs (as result summary)...");
//System.out.println("Print algorithm result summary...");
PrintWriter printer = new PrintWriter(output);
//print table-header
printer.println("Motif\tScore\tData matches\tData size\tBackground data matches\tBackground data size");
//print table-body
for(MotifXElucidatedMotif motif: motifs){
printer.println(motif.getMotifSequence() + "\t" +
motif.getMotif_score() + "\t" +
motif.getIdPepMatches().size() + "\t" +
motif.getIdSequencesSize() + "\t" +
motif.getBgPepMatches().size() + "\t" +
motif.getBgSequencesSize());
}
printer.close();
}
}