Package jmotifx.preprocess

Source Code of jmotifx.preprocess.JMotifXBlockSequenceExtractor

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package jmotifx.preprocess;

import database.AminoAcid;
import database.Database;
import database.Protein;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import jmotifx.JMotifXSequenceFileReader;
import jmotifx.sequenceobjects.AroundSitePeptideObject;

/**
*
* @author paiyeta1
*/
public class JMotifXBlockSequenceExtractor {
   
    public ArrayList<String>  extractBlockSeqs4rmDBWithMotifNCentralResidue(HashMap<String, String> configMap, Database db) {
        //throw new UnsupportedOperationException("Not yet implemented");
        ArrayList<String> blockSequences = new ArrayList<String>();
        String motifRegularExpression = configMap.get("motif");
        String centralResidue = configMap.get("centerResidue");
        int peptideWindow = Integer.parseInt(configMap.get("peptideWindow"));
        ArrayList<Protein> proteins = db.getProteins();
        //for each protein, extract central residue indeces
        for(Protein protein : proteins){
            ArrayList<Integer> residueIndeces = protein.getResidueIndeces(centralResidue);
            //for each residue index get block sequence around it
            for(int residueIndex : residueIndeces){
                String proteinSequence = protein.getSequence();
                String blockSequence = extractBlockSequence(proteinSequence, residueIndex, peptideWindow);
                //if block sequence matches prespecified motif, add to the list of bloc sequences
                if(blockSequence.matches(motifRegularExpression))
                    blockSequences.add(blockSequence);
            }           
        }
        return blockSequences;
    }
   
    /*
     * Extracts a peptide sequence about a given residue index from a protein sequence.
     * @param proteinSequence
     * @param index
     * @param peptideWindow
     * @return blockSequence
     */
    public String extractBlockSequence(String proteinSequence, int index, int peptideWindow){
        String blockSequence = null;
        int sequenceLength = proteinSequence.length();
        int preNPostCenterLength = peptideWindow/2;
                   
        if (index > preNPostCenterLength && (((sequenceLength-1) - index) >= preNPostCenterLength) ){ // pre > 50 and post >= 50
            blockSequence = proteinSequence.substring(index - preNPostCenterLength, index + preNPostCenterLength + 1);           
        }
        else if(index > preNPostCenterLength && (((sequenceLength-1) - index) < preNPostCenterLength)){//pre > 50 and post glycosite < 50
            //determine the length of the peptide substring
            blockSequence =  proteinSequence.substring(index - preNPostCenterLength);
            int lengthOfFill = preNPostCenterLength - ((sequenceLength-1) - index);
            blockSequence = blockSequence + getFillerString(lengthOfFill);          
        }
        else if(index <= preNPostCenterLength && (((sequenceLength-1) - index) >= preNPostCenterLength)){// pre <= 50, post > 50
            blockSequence = proteinSequence.substring(0, index + preNPostCenterLength + 1);
            int lengthOfFill = preNPostCenterLength - (index);
            blockSequence = getFillerString(lengthOfFill) + blockSequence;
           
        }
        else if(sequenceLength - index < preNPostCenterLength && index <= preNPostCenterLength){//pre <= 50, post < 50
            blockSequence = proteinSequence;
            int preFillLength = preNPostCenterLength - (index);
            int postFillLength = preNPostCenterLength - ((sequenceLength-1) - index) ;
            blockSequence = getFillerString(preFillLength) + blockSequence + getFillerString(postFillLength);           
        }
        return blockSequence;
    }

   
    //newer implementation of block sequence extractor routine - 20140417
    public ArrayList<String> extractBlockSequence(ArrayList<String> inputSequences,
                                            HashMap<String, String> configMap, Database db) throws FileNotFoundException,
                                                                                                            IOException {
       
        //throw new UnsupportedOperationException("Not yet implemented");
        String prespecifiedMotif = configMap.get("motif"); // get prespecified motif
        String centerResidue = configMap.get("centerResidue"); // get residue about which sequence should be centered...
        int peptideWindow = Integer.parseInt(configMap.get("peptideWindow")); // get peptide preNPostCenterLength
        // presupposing a motif is prespecified.
        ArrayList<String> prespecifiedMotifContainingSequences =
                extractMotifContainingSequences(inputSequences,prespecifiedMotif);
        ArrayList<String> blockSequences =
                extractBlockSequenceThisWindow(prespecifiedMotifContainingSequences, peptideWindow, db, centerResidue);
        return blockSequences;
    }

   
    public ArrayList<String> extractBlockSequenceThisWindow(ArrayList<String> sequences, int window, Database db, String center)
            throws FileNotFoundException, IOException{
        HashMap<String, LinkedList<String>> pep2ProtAccnsMap = mapPep2ProteinTitleLines(sequences, db);
        ArrayList<String> blockSequence = extractBlockSequence(pep2ProtAccnsMap, window, db, center);     
        return blockSequence;
    }
   
    private HashMap<String, LinkedList<String>> mapPep2ProteinTitleLines(ArrayList<String> glycs, Database db) {
        System.out.println(" Mapping identified glycosites to protein accessions...");
        HashMap<String, LinkedList<String>> pep2ProtTitleLinesMap = new HashMap<String, LinkedList<String>>();
        for(String glyc : glycs){
            glyc = glyc.replaceAll("[^A-Z]","");
            LinkedList<String> protTitleLines = getMappedProtTitleLines(glyc, db);
            pep2ProtTitleLinesMap.put(glyc, protTitleLines);
        }      
        return pep2ProtTitleLinesMap;
    }

    private LinkedList<String> getMappedProtTitleLines(String glyc, Database db) {
        //throw new UnsupportedOperationException("Not yet implemented");
        LinkedList<String> mappedProtTitleLines = new LinkedList<String>();
        ArrayList<Protein> proteins = db.getProteins();
        for(Protein prot : proteins){
            if(prot.getSequence().contains(glyc)){
                mappedProtTitleLines.add(prot.getTitleLine()); //the implemented part
            }
        }      
        return mappedProtTitleLines;      
    }
    public ArrayList<String> extractBlockSequence(HashMap<String, LinkedList<String>> pep2ProtTitleLinesMap,
                                                        int peptideWindow, Database db, String center) throws FileNotFoundException, IOException {
        ArrayList<String> blockSequenceList = new ArrayList<String>();
        int preNPostCenterResidues = peptideWindow/2;
        //Database db = new Database(dBPath);
       
        //for a constant time retrieval of protein sequences
        HashMap<String,String> protTitleLine2ProtSequenceMap = db.getProtTitleLine2ProtSequenceMap();
       
        Set<String> peps = pep2ProtTitleLinesMap.keySet();
        for(String pep: peps){
            //get the best of the mapped prot accns
            LinkedList<String> tLines = pep2ProtTitleLinesMap.get(pep);
            String bestTLine = extractBestTLine(pep, tLines, protTitleLine2ProtSequenceMap);
            //ArrayList<Protein> proteins = db.getProteins();
            //for(Protein prot: proteins){
           
                //if(prot.getTitleLine().contains(bestAccn)){
                    //int nxstIndex = getIndex(pep);
                    AroundSitePeptideObject aroundSitePeptide = new AroundSitePeptideObject(pep, center);
                    //String protSeq = prot.getSequence();
                    String protSeq = protTitleLine2ProtSequenceMap.get(bestTLine);
                    int seq_length = protSeq.length();
                    int indexOf = protSeq.indexOf(pep);
                    int protPeptideCenterIndex = indexOf + aroundSitePeptide.getCenterIndex();
                   
                    String blockPeptide = null;
                    if (protPeptideCenterIndex > preNPostCenterResidues &&
                            (((seq_length-1) - protPeptideCenterIndex) >= preNPostCenterResidues) ){ // pre > 50 and post >= 50
                        blockPeptide = protSeq.substring(protPeptideCenterIndex - preNPostCenterResidues,
                                protPeptideCenterIndex + preNPostCenterResidues + 1);
                        //pep = pep100.toCharArray();
                        blockSequenceList.add(blockPeptide);
                    }
                    else if(protPeptideCenterIndex > preNPostCenterResidues &&
                            (((seq_length-1) - protPeptideCenterIndex) < preNPostCenterResidues)){//pre > 50 and post glycosite < 50
                        //determine the length of the peptide substring
                        blockPeptide =  protSeq.substring(protPeptideCenterIndex - preNPostCenterResidues);
                        int lengthOfFill = preNPostCenterResidues - ((seq_length-1) - protPeptideCenterIndex);
                        blockPeptide = blockPeptide + getFillerString(lengthOfFill);
                        blockSequenceList.add(blockPeptide);
                    }
                    else if(protPeptideCenterIndex <= preNPostCenterResidues &&
                            (((seq_length-1) - protPeptideCenterIndex) >= preNPostCenterResidues)){// pre <= 50, post > 50
                        blockPeptide = protSeq.substring(0, protPeptideCenterIndex + preNPostCenterResidues + 1);
                        int lengthOfFill = preNPostCenterResidues - (protPeptideCenterIndex);
                        blockPeptide = getFillerString(lengthOfFill) + blockPeptide;
                        blockSequenceList.add(blockPeptide);
                    }
                    else if(seq_length - protPeptideCenterIndex < preNPostCenterResidues &&
                            protPeptideCenterIndex <= preNPostCenterResidues){//pre <= 50, post < 50
                        blockPeptide = protSeq;
                        int preFillLength = preNPostCenterResidues - (protPeptideCenterIndex);
                        int postFillLength = preNPostCenterResidues - ((seq_length-1) - protPeptideCenterIndex) ;
                        blockPeptide = getFillerString(preFillLength) + blockPeptide + getFillerString(postFillLength);
                        blockSequenceList.add(blockPeptide);
                    }
                //}
            //}
        }      
        return blockSequenceList;
    }
   
    /*
     *
     *
    public LinkedList<JMotifXBlockSequence> extractBlockSequenceObject(HashMap<String, LinkedList<String>> pep2ProtAccnsMap,
                                                        int window, String dBPath) throws FileNotFoundException, IOException {
        LinkedList<JMotifXBlockSequence> blockSequenceList = new LinkedList<JMotifXBlockSequence>();
        Database db = new Database(dBPath);
        HashMap<String,String> protTitleLine2ProtSequenceMap = db.getProtTitleLine2ProtSequenceMap();
       
        Set<String> peps = pep2ProtAccnsMap.keySet();
        for(String pep: peps){
            //get the best of the mapped prot accns
            LinkedList<String> accns = pep2ProtAccnsMap.get(pep);
            String bestAccn = extractBestTLine(pep, accns, protTitleLine2ProtSequenceMap);
            ArrayList<Protein> proteins = db.getProteins();
            for(Protein prot: proteins){
                if(prot.getTitleLine().contains(bestAccn)){
                    //int nxstIndex = getIndex(pep);
                    GlycPepObj glycP = new GlycPepObj(pep);
                    String protSeq = prot.getSequence();
                    int seq_length = protSeq.length();
                    int indexOf = protSeq.indexOf(pep);
                    int protNXSTIndex = indexOf + glycP.getNXSTIndex();
                   
                    String blockPeptide = null;
                    if (protNXSTIndex > window && (((seq_length-1) - protNXSTIndex) >= window) ){ // pre > 50 and post >= 50
                        blockPeptide = protSeq.substring(protNXSTIndex - window, protNXSTIndex + window + 1);
                        //pep = pep100.toCharArray();
                        blockSequenceList.add(new JMotifXBlockSequence(blockPeptide, pep, bestAccn, protNXSTIndex));
                    }
                    else if(protNXSTIndex > window && (((seq_length-1) - protNXSTIndex) < window)){//pre > 50 and post glycosite < 50
                        //determine the length of the peptide substring
                        blockPeptide =  protSeq.substring(protNXSTIndex - window);
                        int lengthOfFill = window - ((seq_length-1) - protNXSTIndex);
                        blockPeptide = blockPeptide + getFillerString(lengthOfFill);
                        blockSequenceList.add(new JMotifXBlockSequence(blockPeptide, pep, bestAccn, protNXSTIndex));
                    }
                    else if(protNXSTIndex <= window && (((seq_length-1) - protNXSTIndex) >= window)){// pre <= 50, post > 50
                        blockPeptide = protSeq.substring(0, protNXSTIndex + window + 1);
                        int lengthOfFill = window - (protNXSTIndex);
                        blockPeptide = getFillerString(lengthOfFill) + blockPeptide;
                        blockSequenceList.add(new JMotifXBlockSequence(blockPeptide, pep, bestAccn, protNXSTIndex));
                    }
                    else if(seq_length - protNXSTIndex < window && protNXSTIndex <= window){//pre <= 50, post < 50
                        blockPeptide = protSeq;
                        int preFillLength = window - (protNXSTIndex);
                        int postFillLength = window - ((seq_length-1) - protNXSTIndex) ;
                        blockPeptide = getFillerString(preFillLength) + blockPeptide + getFillerString(postFillLength);
                        blockSequenceList.add(new JMotifXBlockSequence(blockPeptide, pep, bestAccn, protNXSTIndex));
                    }
                }
            }
        }      
        return blockSequenceList;
    }
    *
    */
  
     private String extractBestTLine(String pep, LinkedList<String> accns, HashMap<String, String> protTitleLine2ProtSequenceMap) {
        //throw new UnsupportedOperationException("Not yet implemented");
        String best = null;
        if(accns.size()==1){
            best = accns.get(0);
        }else{
            best = accns.get(0);
            //implement an evaluation of the best
            //in this eveluation the first is returned
        }      
        return best;
    }
   
    public String getFillerString(int fillerLength){
        String fillerString;
        char[] fill = new char[fillerLength];
        for(int i=0; i < fillerLength; i++){
            fill[i] = '-';
        }
        fillerString = String.valueOf(fill);       
        return fillerString;
    }
       
    public void print(LinkedList<String> blockSequences, String outDir, String inFile) throws FileNotFoundException {
        //throw new UnsupportedOperationException("Not yet implemented");
        String outFile = outDir + File.separator + new File(inFile).getName().replace(".txt", "") + ".bseqs";
        PrintWriter printer = new PrintWriter(outFile);
        for(String seq : blockSequences){
            printer.println(seq);
        }
        printer.close();      
    }
   
    public void printBlockSequenceObject(LinkedList<JMotifXBlockSequence> blockSequences, String outDir, String inFile) throws FileNotFoundException {
        //throw new UnsupportedOperationException("Not yet implemented");
        String outFile = outDir + File.separator + new File(inFile).getName().replace(".txt", "") + ".bseqs.plus";
        PrintWriter printer = new PrintWriter(outFile);
        //print header
        printer.println("BlockSequence\tPeptide\tNXSTLocation\tProteinAccn");
        for(JMotifXBlockSequence seq : blockSequences){
            printer.println(seq.getBlock() + "\t" +
                            seq.getPeptide() + "\t" +
                            seq.getNxstLocation() + "\t" +
                            seq.getProteinAcc());
        }
        printer.close();      
    }

    private ArrayList<String> extractMotifContainingSequences(ArrayList<String> inputSequences, String prespecifiedMotif) {
        //throw new UnsupportedOperationException("Not yet implemented");
        ArrayList<String> motifSequences = new ArrayList<String>();
        for(String sequence : inputSequences ){
            if(sequence.matches(prespecifiedMotif))
                motifSequences.add(sequence);
        }
        return motifSequences;
    }

    public ArrayList<String> extractRandomBlockSequences(HashMap<String, String> configMap, Database db)
            throws FileNotFoundException, IOException {
        //throw new UnsupportedOperationException("Not yet implemented");
        ArrayList<String> randomBlockSequences = new ArrayList<String>();
        int peptideWindow = Integer.parseInt(configMap.get("peptideWindow"));
        String centerResidue = configMap.get("centerResidue");
        AminoAcid[] aa = db.getAminoAcids();
        // to determine the number of random sequences to generate,
        // extract the number of input peptide sequences, and multiply by a factor of 10 [default];
        String inputPeptideSequenceFile = configMap.get("peptideSequenceFile");
        JMotifXSequenceFileReader fR = new JMotifXSequenceFileReader();
        int inputPeptides = fR.extractSequences(inputPeptideSequenceFile).size();
        int randoms = inputPeptides * 10;
        int preNPostCenterLength = peptideWindow/2;
        for(int i = 0 ; i < randoms; i++){
            String randomBlockSequence;
            //generate random
            Random random = new Random();
            char[] seqCharArr = new char[((preNPostCenterLength * 2) + 1)];
            for(int j = 0; j < seqCharArr.length; j++){
                if(j == preNPostCenterLength + 1){
                    char symb = centerResidue.charAt(0);
                    seqCharArr[j] = symb;
                } else {
                    char symb = aa[random.nextInt(aa.length)].getSymbol();
                    seqCharArr[j] = symb;
                }
            }
            randomBlockSequence = String.valueOf(seqCharArr);
            randomBlockSequences.add(randomBlockSequence);
        }     
        return randomBlockSequences;
    }
  
}
TOP

Related Classes of jmotifx.preprocess.JMotifXBlockSequenceExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.