/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package jmotifx.preprocess;
import database.AminoAcid;
import database.Database;
import database.Protein;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import java.util.regex.Pattern;
import jmotifx.JMotifXSequenceFileReader;
import jmotifx.sequenceobjects.AroundSitePeptideObject;
/**
 * Builds fixed-width "block" sequences: windows of a protein sequence centered on a
 * residue of interest and padded with {@code '-'} wherever the window runs past either
 * end of the protein.
 *
 * <p>Every block produced here has length {@code 2 * (peptideWindow / 2) + 1}, with the
 * center residue at offset {@code peptideWindow / 2}.
 *
 * @author paiyeta1
 */
public class JMotifXBlockSequenceExtractor {

    /** Character used to pad a block that extends beyond the protein sequence. */
    private static final char FILLER = '-';

    /** Number of random block sequences generated per input peptide sequence. */
    private static final int RANDOM_SEQUENCES_PER_INPUT = 10;

    /**
     * Scans every protein in the database, builds a block around each occurrence of the
     * configured center residue and keeps the blocks that fully match the configured motif.
     *
     * @param configMap configuration; reads the keys {@code "motif"} (regular expression),
     *                  {@code "centerResidue"} and {@code "peptideWindow"} (integer)
     * @param db        database supplying the proteins to scan
     * @return the matching block sequences, in protein / residue-index iteration order
     * @throws NumberFormatException if {@code "peptideWindow"} is not an integer
     * @throws java.util.regex.PatternSyntaxException if {@code "motif"} is not a valid expression
     */
    public ArrayList<String> extractBlockSeqs4rmDBWithMotifNCentralResidue(HashMap<String, String> configMap, Database db) {
        ArrayList<String> blockSequences = new ArrayList<String>();
        String motifRegularExpression = configMap.get("motif");
        String centralResidue = configMap.get("centerResidue");
        int peptideWindow = Integer.parseInt(configMap.get("peptideWindow"));
        // Compiled once: the same expression is tested against every candidate block.
        Pattern motifPattern = Pattern.compile(motifRegularExpression);
        ArrayList<Protein> proteins = db.getProteins();
        for (Protein protein : proteins) {
            ArrayList<Integer> residueIndeces = protein.getResidueIndeces(centralResidue);
            String proteinSequence = protein.getSequence();
            for (int residueIndex : residueIndeces) {
                String blockSequence = extractBlockSequence(proteinSequence, residueIndex, peptideWindow);
                if (motifPattern.matcher(blockSequence).matches()) {
                    blockSequences.add(blockSequence);
                }
            }
        }
        return blockSequences;
    }

    /**
     * Extracts the block of {@code peptideWindow / 2} residues on each side of
     * {@code index}, padding with {@code '-'} where the protein is too short.
     *
     * <p>For an index inside the sequence the result always has length
     * {@code 2 * (peptideWindow / 2) + 1}; this method never returns {@code null}.
     *
     * @param proteinSequence sequence to cut the block from
     * @param index           position in {@code proteinSequence} of the residue to center on
     * @param peptideWindow   total window size; half of it (integer division) is taken on each side
     * @return the padded block sequence
     */
    public String extractBlockSequence(String proteinSequence, int index, int peptideWindow) {
        int sequenceLength = proteinSequence.length();
        int flankLength = peptideWindow / 2;
        int residuesAfterCenter = (sequenceLength - 1) - index;

        // Padding is only needed on a side where fewer than flankLength residues exist.
        int preFillLength = Math.max(0, flankLength - index);
        int postFillLength = Math.max(0, flankLength - residuesAfterCenter);

        // Clamp the window to the bounds of the protein sequence.
        int start = Math.max(0, index - flankLength);
        int end = Math.min(sequenceLength, index + flankLength + 1);

        return getFillerString(preFillLength)
                + proteinSequence.substring(start, end)
                + getFillerString(postFillLength);
    }

    /**
     * Newer block extraction routine (20140417): keeps the input sequences that match the
     * configured motif, maps them onto database proteins and cuts a block around each one.
     *
     * @param inputSequences peptide sequences to process
     * @param configMap      configuration; reads {@code "motif"}, {@code "centerResidue"}
     *                       and {@code "peptideWindow"}
     * @param db             database the peptides are mapped against
     * @return one block sequence per distinct mappable peptide
     * @throws FileNotFoundException declared for callers; see the delegated overloads
     * @throws IOException           declared for callers; see the delegated overloads
     */
    public ArrayList<String> extractBlockSequence(ArrayList<String> inputSequences,
            HashMap<String, String> configMap, Database db) throws FileNotFoundException,
            IOException {
        String prespecifiedMotif = configMap.get("motif");
        String centerResidue = configMap.get("centerResidue");
        int peptideWindow = Integer.parseInt(configMap.get("peptideWindow"));
        // Presupposes that a motif has been specified.
        ArrayList<String> prespecifiedMotifContainingSequences =
                extractMotifContainingSequences(inputSequences, prespecifiedMotif);
        return extractBlockSequenceThisWindow(prespecifiedMotifContainingSequences, peptideWindow, db, centerResidue);
    }

    /**
     * Maps the given sequences onto database proteins and extracts a block of the given
     * window around each of them.
     *
     * @param sequences peptide sequences to map
     * @param window    total window size
     * @param db        database the peptides are mapped against
     * @param center    center residue handed to {@link AroundSitePeptideObject}
     * @return one block sequence per distinct mappable peptide
     * @throws FileNotFoundException declared for callers
     * @throws IOException           declared for callers
     */
    public ArrayList<String> extractBlockSequenceThisWindow(ArrayList<String> sequences, int window, Database db, String center)
            throws FileNotFoundException, IOException {
        HashMap<String, LinkedList<String>> pep2ProtAccnsMap = mapPep2ProteinTitleLines(sequences, db);
        return extractBlockSequence(pep2ProtAccnsMap, window, db, center);
    }

    /**
     * Maps each peptide, stripped of every character outside {@code A-Z}, to the title
     * lines of the proteins whose sequence contains it.
     */
    private HashMap<String, LinkedList<String>> mapPep2ProteinTitleLines(ArrayList<String> glycs, Database db) {
        System.out.println(" Mapping identified glycosites to protein accessions...");
        HashMap<String, LinkedList<String>> pep2ProtTitleLinesMap = new HashMap<String, LinkedList<String>>();
        for (String glyc : glycs) {
            String strippedPeptide = glyc.replaceAll("[^A-Z]", "");
            // A repeated peptide maps to the same proteins; do not rescan the database for it.
            if (!pep2ProtTitleLinesMap.containsKey(strippedPeptide)) {
                pep2ProtTitleLinesMap.put(strippedPeptide, getMappedProtTitleLines(strippedPeptide, db));
            }
        }
        return pep2ProtTitleLinesMap;
    }

    /** Returns the title lines of all database proteins whose sequence contains {@code glyc}. */
    private LinkedList<String> getMappedProtTitleLines(String glyc, Database db) {
        LinkedList<String> mappedProtTitleLines = new LinkedList<String>();
        for (Protein prot : db.getProteins()) {
            if (prot.getSequence().contains(glyc)) {
                mappedProtTitleLines.add(prot.getTitleLine());
            }
        }
        return mappedProtTitleLines;
    }

    /**
     * Extracts one block per peptide of the map, cut from the protein chosen by
     * {@code extractBestTLine} and centered on the peptide's center residue.
     *
     * <p>Peptides with no mapped protein, whose chosen title line has no sequence in the
     * database, or that do not occur in that sequence are skipped.
     *
     * @param pep2ProtTitleLinesMap peptide to candidate protein title lines
     * @param peptideWindow         total window size
     * @param db                    database supplying the title-line to sequence map
     * @param center                center residue handed to {@link AroundSitePeptideObject}
     * @return the extracted block sequences, in the map's iteration order
     * @throws FileNotFoundException declared for callers
     * @throws IOException           declared for callers
     */
    public ArrayList<String> extractBlockSequence(HashMap<String, LinkedList<String>> pep2ProtTitleLinesMap,
            int peptideWindow, Database db, String center) throws FileNotFoundException, IOException {
        ArrayList<String> blockSequenceList = new ArrayList<String>();
        // Title line -> sequence map gives constant-time retrieval of protein sequences.
        HashMap<String, String> protTitleLine2ProtSequenceMap = db.getProtTitleLine2ProtSequenceMap();
        for (Map.Entry<String, LinkedList<String>> entry : pep2ProtTitleLinesMap.entrySet()) {
            String pep = entry.getKey();
            String bestTLine = extractBestTLine(pep, entry.getValue(), protTitleLine2ProtSequenceMap);
            if (bestTLine == null) {
                continue; // peptide was not found in any protein
            }
            String protSeq = protTitleLine2ProtSequenceMap.get(bestTLine);
            if (protSeq == null) {
                continue; // title line unknown to the sequence map
            }
            int peptideStart = protSeq.indexOf(pep);
            if (peptideStart < 0) {
                continue; // peptide does not occur in the chosen protein
            }
            AroundSitePeptideObject aroundSitePeptide = new AroundSitePeptideObject(pep, center);
            // getCenterIndex() is presumably the offset of the center residue within the
            // peptide; it is added to the peptide's start to locate it in the protein.
            int protPeptideCenterIndex = peptideStart + aroundSitePeptide.getCenterIndex();
            blockSequenceList.add(extractBlockSequence(protSeq, protPeptideCenterIndex, peptideWindow));
        }
        return blockSequenceList;
    }

    /**
     * Chooses the protein title line to use for a peptide. No ranking is implemented
     * yet: the first candidate is returned. {@code pep} and the sequence map are not
     * used by the current implementation.
     *
     * @return the first title line, or {@code null} when there are no candidates
     */
    private String extractBestTLine(String pep, LinkedList<String> accns, HashMap<String, String> protTitleLine2ProtSequenceMap) {
        if (accns == null || accns.isEmpty()) {
            return null;
        }
        return accns.get(0);
    }

    /**
     * Builds a padding string of {@code '-'} characters.
     *
     * @param fillerLength number of filler characters; must not be negative
     * @return a string of {@code fillerLength} filler characters
     * @throws NegativeArraySizeException if {@code fillerLength} is negative
     */
    public String getFillerString(int fillerLength) {
        char[] fill = new char[fillerLength];
        Arrays.fill(fill, FILLER);
        return String.valueOf(fill);
    }

    /**
     * Writes the block sequences, one per line, to
     * {@code <outDir>/<name of inFile without ".txt">.bseqs}.
     *
     * @param blockSequences sequences to write
     * @param outDir         output directory
     * @param inFile         input file whose name determines the output file name
     * @throws FileNotFoundException if the output file cannot be created or opened
     */
    public void print(LinkedList<String> blockSequences, String outDir, String inFile) throws FileNotFoundException {
        String outFile = outDir + File.separator + new File(inFile).getName().replace(".txt", "") + ".bseqs";
        PrintWriter printer = new PrintWriter(outFile);
        try {
            for (String seq : blockSequences) {
                printer.println(seq);
            }
        } finally {
            // Release the file handle even if writing fails part-way.
            printer.close();
        }
    }

    /**
     * Writes the block sequence objects as a tab-separated table with a header line to
     * {@code <outDir>/<name of inFile without ".txt">.bseqs.plus}.
     *
     * @param blockSequences block sequence objects to write
     * @param outDir         output directory
     * @param inFile         input file whose name determines the output file name
     * @throws FileNotFoundException if the output file cannot be created or opened
     */
    public void printBlockSequenceObject(LinkedList<JMotifXBlockSequence> blockSequences, String outDir, String inFile) throws FileNotFoundException {
        String outFile = outDir + File.separator + new File(inFile).getName().replace(".txt", "") + ".bseqs.plus";
        PrintWriter printer = new PrintWriter(outFile);
        try {
            printer.println("BlockSequence\tPeptide\tNXSTLocation\tProteinAccn");
            for (JMotifXBlockSequence seq : blockSequences) {
                printer.println(seq.getBlock() + "\t" +
                        seq.getPeptide() + "\t" +
                        seq.getNxstLocation() + "\t" +
                        seq.getProteinAcc());
            }
        } finally {
            // Release the file handle even if writing fails part-way.
            printer.close();
        }
    }

    /** Returns the input sequences that fully match the given regular expression. */
    private ArrayList<String> extractMotifContainingSequences(ArrayList<String> inputSequences, String prespecifiedMotif) {
        ArrayList<String> motifSequences = new ArrayList<String>();
        // Compiled once instead of once per sequence.
        Pattern motifPattern = Pattern.compile(prespecifiedMotif);
        for (String sequence : inputSequences) {
            if (motifPattern.matcher(sequence).matches()) {
                motifSequences.add(sequence);
            }
        }
        return motifSequences;
    }

    /**
     * Generates random block sequences: every position holds a randomly chosen amino
     * acid symbol from the database, except the center position, which holds the first
     * character of the configured center residue. Ten sequences are generated per
     * sequence read from the configured peptide sequence file.
     *
     * @param configMap configuration; reads {@code "peptideWindow"}, {@code "centerResidue"}
     *                  and {@code "peptideSequenceFile"}
     * @param db        database supplying the amino acids to draw from
     * @return the generated sequences, each of length {@code 2 * (peptideWindow / 2) + 1}
     * @throws FileNotFoundException declared for callers; presumably raised when the peptide
     *                               sequence file is missing
     * @throws IOException           declared for callers; presumably raised when the peptide
     *                               sequence file cannot be read
     */
    public ArrayList<String> extractRandomBlockSequences(HashMap<String, String> configMap, Database db)
            throws FileNotFoundException, IOException {
        ArrayList<String> randomBlockSequences = new ArrayList<String>();
        int peptideWindow = Integer.parseInt(configMap.get("peptideWindow"));
        String centerResidue = configMap.get("centerResidue");
        AminoAcid[] aa = db.getAminoAcids();
        // The number of random sequences is derived from the number of input peptides.
        String inputPeptideSequenceFile = configMap.get("peptideSequenceFile");
        JMotifXSequenceFileReader fR = new JMotifXSequenceFileReader();
        int inputPeptides = fR.extractSequences(inputPeptideSequenceFile).size();
        int randoms = inputPeptides * RANDOM_SEQUENCES_PER_INPUT;
        int preNPostCenterLength = peptideWindow / 2;
        int blockLength = (preNPostCenterLength * 2) + 1;
        Random random = new Random();
        for (int i = 0; i < randoms; i++) {
            char[] seqCharArr = new char[blockLength];
            for (int j = 0; j < blockLength; j++) {
                // The center sits at offset preNPostCenterLength, exactly as in extracted blocks.
                if (j == preNPostCenterLength) {
                    seqCharArr[j] = centerResidue.charAt(0);
                } else {
                    seqCharArr[j] = aa[random.nextInt(aa.length)].getSymbol();
                }
            }
            randomBlockSequences.add(String.valueOf(seqCharArr));
        }
        return randomBlockSequences;
    }
}