Package bgu.bio.compression

Source Code of bgu.bio.compression.SequenceMatchUp

package bgu.bio.compression;

import gnu.trove.list.linked.TIntLinkedList;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Set;

import javax.sound.midi.Sequence;

/**
* This class finds matches (as long as possible) between two strings
* @author Nitzan
*/
public class SequenceMatchUp {

  /**
   * The two string in which we will look for matches
   */
  private String s1, s2;
 
  /**
   * The list of the match's beginning indices on the two strings
   */
  private List<StringRange> indices;
 
  /**
   * Minimal length for match (k-tuple)
   */
  private int k;
 
  /**
   *
   */
  private final static int BAND_WIDTH = 50000;
 
 
  private int start1;
  private int start2;
  private int end1;
  private int end2;
 
  /**
   * Constructor
   * @param s1 first string
   * @param s2 second string
   */
  public SequenceMatchUp(String s1, String s2, int k){
    this(s1, 0, s1.length()-1, s2, 0, s2.length()-1, k);
  }
 
 
  /**
   * Constructor
   * @param s1 first string
   * @param s2 second string
   */
  public SequenceMatchUp(String s1, int b1, int e1, String s2, int b2, int e2, int k){
    this.s1 = s1;
    this.s2 = s2;
    this.indices = new ArrayList<StringRange>();
    this.k = k;
    this.start1 = b1;
    this.start2 = b2;
    this.end1 = e1;
    this.end2 = e2;
  }
 
 
  /**
   * Creates matches of length >= k and insert them to the indices list
   */
  public void matchKTuples(){
   
    HashMap<String, TIntLinkedList> s1Map = new HashMap<String, TIntLinkedList>();
    String s;
    TIntLinkedList l1;
   
    //map k-tuples in s1
    for(int i=start1; i<=(end1+1-k); i++){
      s = s1.substring(i, i+k);
     
      if(s1Map.containsKey(s)){
        l1 = s1Map.get(s);
        l1.add(i);
      }
      else{
        l1 = new TIntLinkedList();
        l1.add(i);
        s1Map.put(s, l1);
      }
    }
   
    //the fornt list - will be updated as we're moving along s2
    List<StringRange> front = new ArrayList<StringRange>();
   
   
    //lookup k-tuples in s2
    for(int i=start2; i<=(end2+1-k); i++){
     
      s = s2.substring(i, i+k);
           
      l1 = s1Map.get(s);
     
     
      if(l1 == null){ //no matches for this key in s1Map
        for(int p=0; p<front.size(); p++){
          indices.add(front.get(p)); //stop point for every range on the front list
        }
        front.clear();
      }
      else{
        //create new front list according the previous front list and l1
        List<StringRange> newFront = new ArrayList<StringRange>();
        int l1Pos = 0;
        int frontPos = 0;
       
        //move over the front list (newFront) and the list of indices from s1 (l1) and act according to the case:
        while(l1Pos < l1.size() && frontPos < front.size()){
         
          int l1Element = l1.get(l1Pos);
          StringRange frontElement = front.get(frontPos);
         
          //increase k-tuple length by 1
          if(l1Element - frontElement.getI1() == frontElement.getLength()-k+1){
            newFront.add(new StringRange(frontElement.getI1(), frontElement.getI2(), frontElement.getLength()+1));
            l1Pos++;
            frontPos++;
          }
          //add new k-tuple indices to the front list
          else if(l1Element - frontElement.getI1() < frontElement.getLength()-k+1){
            newFront.add(new StringRange(l1Element, i, k));
            l1Pos++;
          }
          //end of k-tuple - can't get any longer
          else{
            indices.add(frontElement);
            frontPos++;
          }
         
        }
       
        //deal with cases of unfinished lists:
       
        while(l1Pos < l1.size()){ //meaning the list of s1 indices is not over
          int l1Element = l1.get(l1Pos);
          newFront.add(new StringRange(l1Element, i, k));
          l1Pos++;
        }
       
        while(frontPos < front.size()){ //meaning the front list is not over
          StringRange frontElement = front.get(frontPos);
          indices.add(frontElement);
          frontPos++;
        }
       
        front = newFront; 
      }     
    }
   
    indices.addAll(front);
  }
 
  /**
   * Get the indices list size
   * @return the indices list size
   */
  public int getSize(){
    return indices.size();
  }
 
  /**
   * Get the indices list
   * @return the indices list
   */
  public List<StringRange> getIndices() {
    return indices;
  }
 
 
  /**
   * Filters the indices list according to the band width parameter
   */
  public void filterIndices(){
   
    int middle = s2.length()/2;
   
    //minimal and maximal column index of the band
    int min = Math.min(middle - BAND_WIDTH/2, s2.length()-k);
    int max = Math.max(middle + BAND_WIDTH/2, 0);
   
    List<StringRange> toRemove = new ArrayList<StringRange>();
   
    //search the unfit indices and add them to the collection which needs to be removed
    StringRange sr;
    int i;
    for(i=0; i<indices.size(); i++){
      sr = indices.get(i);
      if(sr.getI2()<min || sr.getI2()>max)
        toRemove.add(sr);
    }
    //remove the unfit string ranges from the indices list
    indices.removeAll(toRemove);
  }
 
  /**
   * Sorts the indices list according to the lengths of the sequences which were matched up. Primary key for sorting: sequence's length,
   * secondary key: beginning index on s1.
   */
  public void sortIndicesList(){
    Collections.sort(indices, new StringRange.ComparingSR());
  }
 
  /**
   * Finds the first element from which the scan of suitable StringRanges should start, when looking for fitting diagonals
   * @param i1s1 beginning index of s1
   * @param i2s1 ending index of s1
   * @param i1s2 beginning index of s2
   * @param i2s2 ending index of s2
   * @return the locations of the first and last suitable StringRanges
   */
  public int[] lookupRelevantPart(int i1s1, int i2s1, int i1s2, int i2s2){
    int[] ans = new int[2];
    StringRange curr;
    int i, j, length, b1, b2, e1, e2;
    for(i=0; i<indices.size(); i++){
      curr = indices.get(i);
      b1 = curr.getI1(); //beginning index of the matches sequence on s1
      b2 = curr.getI2(); //beginning index of the matches sequence on s2
      length = curr.getLength();
      e1 = b1 + length; //ending index of the matches sequence on s1
      e2 = b2 + length; //ending index of the matches sequence on s2
      if((((b1>=i1s1) && (b1<=i2s1)) && ((b2>=i1s2) && (b2<=i2s2))) || (((e1>=i1s1) && (e1<=i2s1)) && ((e2>=i1s2) && (e2<=i2s2)))){
        ans[0] = i;
        ans[1] = i;
        for(j=i+1; j<indices.size(); j++){
          if(!((((b1>=i1s1) && (b1<=i2s1)) && ((b2>=i1s2) && (b2<=i2s2))) || (((e1>=i1s1) && (e1<=i2s1)) && ((e2>=i1s2) && (e2<=i2s2))))){
            ans[1] = j-1;
          }
        }
      }
    }
    return ans;
  }
 
 
 
  /**
   * Main
   */
  public static void main (String args[]){
   
    FileReader input1=null, input2=null;

    //reading s1 from a file
    StringBuffer s1 = new StringBuffer();
   
    try {
      input1 = new FileReader(args[0]); //get the argument
    } catch (FileNotFoundException e) {
      System.out.println("where is file 1?");
      e.printStackTrace();
    }
    BufferedReader buf1 = new BufferedReader(input1);
    String lineFromF1;

    try {
      lineFromF1 = buf1.readLine();
      while(lineFromF1!=null){
        s1.append(lineFromF1); //connects the string to the StringBuffer
        lineFromF1 = buf1.readLine();
      }
      buf1.close(); //close the buffered reader
     
    } catch (IOException e1) {
      System.out.println("IO problem in F1");
      e1.printStackTrace();
    }
    String str1 = s1.toString(); //cast the StringBuffer to string, so we could handle it in the DiagoalSequenceAlignment class
   
    //same process for s2...
    StringBuffer s2 = new StringBuffer();
    try {
      input2 = new FileReader(args[1]);
    } catch (FileNotFoundException e) {
      System.out.println("where is file 2?");
      e.printStackTrace();
    }

    BufferedReader buf2 = new BufferedReader(input2);
    String lineFromF2;

    try {
      lineFromF2 = buf2.readLine();
      while(lineFromF2!=null){
        s2.append(lineFromF2); //connects the string to the StringBuffer
        lineFromF2 = buf2.readLine();
      }
      buf2.close(); //close the buffered reader
     
    } catch (IOException e1) {
      System.out.println("IO problem in F2");
      e1.printStackTrace();
    }
    String str2 = s2.toString(); //cast the StringBuffer to string, so we could handle it in the DiagoalSequenceAlignment class
   
    int k = Integer.parseInt(args[2]);
   
    System.out.println(new Date());
   
    SequenceMatchUp c = new SequenceMatchUp(str2, 15412, 15917, str1, 15411, 16187, k);
    //SequenceMatchUp c = new SequenceMatchUp(str1, 8, 14, str2, 8, 14, 2);//new SequenceMatchUp(str2, str1, k);
    c.matchKTuples();
  //  System.out.println("after matchKTup: number of common substrings of length of at least " + k + " is: " + c.getSize());   
    //c.filterIndices();
  //  System.out.println("after filterIndices: number of common substrings of length of at least " + k + " is: " + c.getSize());
    c.sortIndicesList();
    //System.out.println("after sort: number of common substrings of length of at least " + k + " is: " + c.getSize());
   
    List<StringRange> list = c.getIndices();
  /*  for(int i=0; i<list.size(); i++){
      System.out.println("i = " + i + " length = " + list.get(i).getLength());
    }*/
    int sum = 0;
    try {
      FileWriter fw = new FileWriter("smu_outputEXAMPLE" + k + ".txt");
      BufferedWriter bw = new BufferedWriter(fw);
      for (StringRange r : c.getIndices()) {
        bw.write("S1 pos: " + r.getI1() + ", S2 pos: " + r.getI2() + " of length " + r.getLength());
        sum+=r.getLength();
        bw.newLine();
      }
      bw.write("number of string ranges found: " + c.getSize());
      bw.newLine();
      bw.write("sum of all the matches: " + sum);
      bw.close();
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
   
   

   
    //System.out.println("number of common substrings of length of at least " + k + " is: " + c.getSize());
    //System.out.println("S1 length " + str1.length() + ", S2 length " + str2.length());
   
    System.out.println(new Date());
  }
 
 
}
TOP

Related Classes of bgu.bio.compression.SequenceMatchUp

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.