package bgu.bio.compression;
import gnu.trove.list.linked.TIntLinkedList;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import javax.sound.midi.Sequence;
/**
 * Finds exact matches (extended as far as possible) between a window of one
 * string and a window of another string.
 *
 * A match is seeded by a common substring of length {@code k} (a k-tuple) and
 * is grown one character at a time while consecutive k-tuples keep matching
 * along the same diagonal. Every match is reported as a {@link StringRange}
 * holding the start index on s1, the start index on s2 and the match length.
 *
 * @author Nitzan
 */
public class SequenceMatchUp {

    /**
     * Total width of the band of s2 start positions, centred on the middle of
     * s2, that {@link #filterIndices()} keeps.
     */
    private final static int BAND_WIDTH = 50000;

    /**
     * The two strings in which we look for matches
     */
    private final String s1, s2;

    /**
     * The matches found so far (start index on s1, start index on s2, length)
     */
    private final List<StringRange> indices;

    /**
     * Minimal length of a match (k-tuple)
     */
    private final int k;

    /** First index (inclusive) of the searched window on s1 */
    private final int start1;
    /** First index (inclusive) of the searched window on s2 */
    private final int start2;
    /** Last index (inclusive) of the searched window on s1 */
    private final int end1;
    /** Last index (inclusive) of the searched window on s2 */
    private final int end2;

    /**
     * Constructor that searches the two strings in full.
     *
     * @param s1 first string
     * @param s2 second string
     * @param k minimal match length (k-tuple size)
     */
    public SequenceMatchUp(String s1, String s2, int k) {
        this(s1, 0, s1.length() - 1, s2, 0, s2.length() - 1, k);
    }

    /**
     * Constructor that restricts the search to a window of each string.
     * No validation is done here; out-of-range bounds surface as a
     * {@link StringIndexOutOfBoundsException} from {@link #matchKTuples()}.
     *
     * @param s1 first string
     * @param b1 first index (inclusive) of the window on s1
     * @param e1 last index (inclusive) of the window on s1
     * @param s2 second string
     * @param b2 first index (inclusive) of the window on s2
     * @param e2 last index (inclusive) of the window on s2
     * @param k minimal match length (k-tuple size)
     */
    public SequenceMatchUp(String s1, int b1, int e1, String s2, int b2, int e2, int k) {
        this.s1 = s1;
        this.s2 = s2;
        this.indices = new ArrayList<StringRange>();
        this.k = k;
        this.start1 = b1;
        this.start2 = b2;
        this.end1 = e1;
        this.end2 = e2;
    }

    /**
     * Creates matches of length >= k and inserts them into the indices list.
     *
     * The k-tuples of the s1 window are indexed by content; the s2 window is
     * then scanned left to right while a "front" of still-growing matches is
     * maintained. Results are appended to the indices list, so calling this
     * method more than once appends the matches again.
     */
    public void matchKTuples() {
        HashMap<String, TIntLinkedList> s1Map = indexS1Tuples();

        //the front list - the matches that are still growing as we move along s2
        List<StringRange> front = new ArrayList<StringRange>();

        //lookup k-tuples in s2
        for (int i = start2; i <= (end2 + 1 - k); i++) {
            TIntLinkedList s1Positions = s1Map.get(s2.substring(i, i + k));
            if (s1Positions == null) {
                //no match for this k-tuple in s1: every growing match stops here
                indices.addAll(front);
                front.clear();
            } else {
                //snapshot to an array once: positional get() on the linked list
                //walks the list on every call, which made the merge quadratic
                front = advanceFront(front, s1Positions.toArray(), i);
            }
        }
        //whatever is still growing at the end of the window is a finished match
        indices.addAll(front);
    }

    /**
     * Maps every k-tuple of the s1 window to the ascending list of its start
     * positions on s1.
     */
    private HashMap<String, TIntLinkedList> indexS1Tuples() {
        HashMap<String, TIntLinkedList> s1Map = new HashMap<String, TIntLinkedList>();
        for (int i = start1; i <= (end1 + 1 - k); i++) {
            String tuple = s1.substring(i, i + k);
            TIntLinkedList positions = s1Map.get(tuple);
            if (positions == null) {
                positions = new TIntLinkedList();
                s1Map.put(tuple, positions);
            }
            positions.add(i);
        }
        return s1Map;
    }

    /**
     * Merges the previous front with the s1 positions of the k-tuple that
     * starts at {@code s2Pos} on s2 and returns the new front. Matches that
     * cannot be extended are moved to the indices list.
     *
     * Both inputs are ordered by s1 position (the front by the s1 position at
     * which each match would continue), so a single two-pointer pass suffices
     * and the returned front keeps that order.
     */
    private List<StringRange> advanceFront(List<StringRange> front, int[] s1Positions, int s2Pos) {
        List<StringRange> newFront = new ArrayList<StringRange>();
        int posIdx = 0;
        int frontIdx = 0;

        while (posIdx < s1Positions.length && frontIdx < front.size()) {
            int s1Pos = s1Positions[posIdx];
            StringRange growing = front.get(frontIdx);
            //s1 position at which the next k-tuple must start to continue this match
            int expected = growing.getI1() + growing.getLength() - k + 1;

            if (s1Pos == expected) {
                //same diagonal: increase the match length by 1
                newFront.add(new StringRange(growing.getI1(), growing.getI2(), growing.getLength() + 1));
                posIdx++;
                frontIdx++;
            } else if (s1Pos < expected) {
                //a k-tuple that continues nothing: start a new match
                newFront.add(new StringRange(s1Pos, s2Pos, k));
                posIdx++;
            } else {
                //end of match - can't get any longer
                indices.add(growing);
                frontIdx++;
            }
        }

        //leftover s1 positions each start a new match
        while (posIdx < s1Positions.length) {
            newFront.add(new StringRange(s1Positions[posIdx], s2Pos, k));
            posIdx++;
        }
        //leftover front entries were not continued, so they are finished
        while (frontIdx < front.size()) {
            indices.add(front.get(frontIdx));
            frontIdx++;
        }
        return newFront;
    }

    /**
     * Get the indices list size
     * @return the indices list size
     */
    public int getSize() {
        return indices.size();
    }

    /**
     * Get the indices list
     * @return the indices list (the live internal list, not a copy)
     */
    public List<StringRange> getIndices() {
        return indices;
    }

    /**
     * Filters the indices list according to the band width parameter: a match
     * is kept only if its start index on s2 lies inside a band of
     * {@link #BAND_WIDTH} positions centred on the middle of s2.
     *
     * The list is filtered in place, so references obtained through
     * {@link #getIndices()} stay valid.
     */
    public void filterIndices() {
        int middle = s2.length() / 2;
        //minimal and maximal column index of the band
        //NOTE(review): min(...)/max(...) look swapped if the intent was to clamp the
        //band to [0, s2.length()-k]; kept as written because the filter result is
        //what callers currently observe - confirm the intent before changing.
        int min = Math.min(middle - BAND_WIDTH / 2, s2.length() - k);
        int max = Math.max(middle + BAND_WIDTH / 2, 0);

        //collect the survivors in one pass, then refill the same list instance;
        //this avoids the quadratic, equals()-based removeAll
        List<StringRange> kept = new ArrayList<StringRange>(indices.size());
        for (StringRange sr : indices) {
            if (sr.getI2() >= min && sr.getI2() <= max) {
                kept.add(sr);
            }
        }
        indices.clear();
        indices.addAll(kept);
    }

    /**
     * Sorts the indices list with {@code StringRange.ComparingSR}. According to
     * the original author's note the primary key is the match length and the
     * secondary key is the start index on s1 (the comparator itself is defined
     * in StringRange).
     */
    public void sortIndicesList() {
        Collections.sort(indices, new StringRange.ComparingSR());
    }

    /**
     * Finds the part of the indices list that is relevant to a rectangle of
     * the s1/s2 plane, when looking for fitting diagonals. A match is relevant
     * when its start point, or its end point (start + length), lies inside the
     * rectangle on both axes.
     *
     * @param i1s1 beginning index of s1
     * @param i2s1 ending index of s1
     * @param i1s2 beginning index of s2
     * @param i2s2 ending index of s2
     * @return a two-element array: the list positions of the first and of the
     *         last relevant StringRange. Entries between them are not
     *         guaranteed to be relevant. When nothing is relevant the result
     *         is {0, 0}, which is indistinguishable from "only element 0 is
     *         relevant", so callers should re-check the elements they scan.
     */
    public int[] lookupRelevantPart(int i1s1, int i2s1, int i1s2, int i2s2) {
        int first = -1;
        int last = -1;
        for (int i = 0; i < indices.size(); i++) {
            if (isRelevant(indices.get(i), i1s1, i2s1, i1s2, i2s2)) {
                if (first < 0) {
                    first = i;
                }
                last = i;
            }
        }
        if (first < 0) {
            //kept from the original implementation for backward compatibility
            return new int[] {0, 0};
        }
        return new int[] {first, last};
    }

    /**
     * Tells whether the start point or the end point of the given match lies
     * inside the rectangle [i1s1, i2s1] x [i1s2, i2s2].
     */
    private static boolean isRelevant(StringRange range, int i1s1, int i2s1, int i1s2, int i2s2) {
        int b1 = range.getI1(); //beginning index of the match on s1
        int b2 = range.getI2(); //beginning index of the match on s2
        int length = range.getLength();
        //NOTE(review): start + length is one past the last matched character;
        //confirm whether an inclusive end (start + length - 1) was intended.
        int e1 = b1 + length;
        int e2 = b2 + length;

        boolean startInside = (b1 >= i1s1) && (b1 <= i2s1) && (b2 >= i1s2) && (b2 <= i2s2);
        boolean endInside = (e1 >= i1s1) && (e1 <= i2s1) && (e2 >= i1s2) && (e2 <= i2s2);
        return startInside || endInside;
    }

    /**
     * Reads a whole text file into one string, dropping the line terminators.
     * Problems are reported on standard output.
     *
     * @param path the file to read
     * @param fileNumber 1 or 2, used only in the error messages
     * @return the file content, or null if the file is missing or unreadable
     */
    private static String readSequence(String path, int fileNumber) {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new FileReader(path));
        } catch (FileNotFoundException e) {
            System.out.println("where is file " + fileNumber + "?");
            e.printStackTrace();
            return null;
        }

        StringBuilder content = new StringBuilder();
        try {
            String line = reader.readLine();
            while (line != null) {
                content.append(line);
                line = reader.readLine();
            }
            return content.toString();
        } catch (IOException e) {
            System.out.println("IO problem in F" + fileNumber);
            e.printStackTrace();
            //a partially read sequence would silently produce wrong matches
            return null;
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                //nothing useful can be done if closing a reader fails
            }
        }
    }

    /**
     * Main: reads two sequence files, matches a hard-coded example window of
     * each, sorts the matches and writes them to
     * {@code smu_outputEXAMPLE<k>.txt}.
     *
     * @param args args[0] = file of the first sequence, args[1] = file of the
     *             second sequence, args[2] = k (minimal match length)
     */
    public static void main(String args[]) {
        if (args.length < 3) {
            System.err.println("usage: SequenceMatchUp <file1> <file2> <k>");
            System.exit(1);
        }

        String str1 = readSequence(args[0], 1);
        if (str1 == null) {
            System.exit(1);
        }
        String str2 = readSequence(args[1], 2);
        if (str2 == null) {
            System.exit(1);
        }

        int k = Integer.parseInt(args[2]);

        System.out.println(new Date());
        //hard-coded example windows; both sequences must be long enough to contain them
        SequenceMatchUp c = new SequenceMatchUp(str2, 15412, 15917, str1, 15411, 16187, k);
        c.matchKTuples();
        c.sortIndicesList();

        int sum = 0;
        BufferedWriter bw = null;
        try {
            bw = new BufferedWriter(new FileWriter("smu_outputEXAMPLE" + k + ".txt"));
            for (StringRange r : c.getIndices()) {
                bw.write("S1 pos: " + r.getI1() + ", S2 pos: " + r.getI2() + " of length " + r.getLength());
                sum += r.getLength();
                bw.newLine();
            }
            bw.write("number of string ranges found: " + c.getSize());
            bw.newLine();
            bw.write("sum of all the matches: " + sum);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            //close (and flush) the writer even when a write failed
            if (bw != null) {
                try {
                    bw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println(new Date());
    }
}