/* Copyright 2013 University of North Carolina at Chapel Hill. All rights reserved. */
package abra.utils.trio;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import abra.Feature;
import abra.RegionLoader;
import abra.utils.trio.LocusGenotype.Genotype;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMRecord;
/**
* Reader class for SAM or BAM file containing paired reads.
* Provides the ability to iterate over all mated pairs in the file.
* Supports multiple mappings for the same read. (i.e. repeating read id's)
* SAM versus BAM format is determined from the file suffix. i.e. ".sam" or ".bam"
*
* @author Lisle Mose (lmose at unc dot edu)
*/
public class TrioVcfReader implements Iterable<TrioGenotype> {
private LocusGenotype cachedFather;
private LocusGenotype cachedMother;
private LocusGenotype cachedChild;
private SAMRecord lastRead;
private int lineCnt = 0;
private BufferedReader father;
private BufferedReader mother;
private BufferedReader child;
private Map<String, Long> chromosomeOrder;
static final String NO_CHROMOSOME = "";
private Feature currentRegion;
private Iterator<Feature> regionIter;
public enum Caller { FREEBAYES, GATK, ISAAC };
private Caller caller;
public TrioVcfReader(String chromosomeFile, String father, String mother, String child, Caller caller) throws IOException {
this.caller = caller;
// GtfLoader loader = new GtfLoader();
// List<Feature> regions = loader.load(regionsGtf);
//
// regionIter = regions.iterator();
// if (regionIter.hasNext()) {
// currentRegion = regionIter.next();
// }
initChromosomeOrder(chromosomeFile);
//GZIPInputStream fatherGzip = new GZIPInputStream(new FileInputStream(father));
//GZIPInputStream motherGzip = new GZIPInputStream(new FileInputStream(mother));
//GZIPInputStream childGzip = new GZIPInputStream(new FileInputStream(child));
//this.father = new BufferedReader(new InputStreamReader(fatherGzip));
// this.mother = new BufferedReader(new InputStreamReader(motherGzip));
// this.child = new BufferedReader(new InputStreamReader(childGzip));
// String l = this.father.readLine();
// while (l != null) {
// System.out.println(l);
// l = this.father.readLine();
// }
this.father = new BufferedReader(new FileReader(father));
this.mother = new BufferedReader(new FileReader(mother));
this.child = new BufferedReader(new FileReader(child));
}
private void initChromosomeOrder(String chromosomeFile) throws IOException {
chromosomeOrder = new HashMap<String, Long>();
BufferedReader reader = new BufferedReader(new FileReader(chromosomeFile));
long order = 0;
String chromosome = reader.readLine();
while (chromosome != null) {
chromosome = chromosome.trim();
chromosomeOrder.put(chromosome, order);
order += 1000000000;
chromosome = reader.readLine();
}
chromosomeOrder.put(NO_CHROMOSOME, order);
reader.close();
}
private long getChromosomeVal(String chr) {
return chromosomeOrder.get(chr);
}
public void close() throws IOException {
father.close();
mother.close();
child.close();
}
private LocusGenotype parseLine(String line) {
LocusGenotype lgt = null;
if (caller == Caller.FREEBAYES) {
lgt = parseFreebayesLine(line);
} else if (caller == Caller.GATK) {
lgt = parseGatkLine(line);
} else if (caller == Caller.ISAAC) {
lgt = parseIsaacLine(line);
} else {
throw new IllegalArgumentException("Invalid caller specified...");
}
return lgt;
}
private LocusGenotype parseIsaacLine(String line) {
// if (line == null) {
// return LocusGenotype.REFERENCE;
// }
try {
String[] fields = line.split("\\s+");
String chr = fields[0];
int pos = Integer.parseInt(fields[1]);
int end = pos;
String ref = fields[3];
String[] alt = fields[4].split(",");
double qual = Double.parseDouble(fields[5]);
// String filter = fields[6];
// String info = fields[7];
// int endIdx = info.indexOf("END=");
// if (endIdx >= 0) {
// int endStopIdx = info.indexOf(";");
// end = Integer.parseInt(info.substring(endIdx+4, endStopIdx));
// }
Genotype gt = null;
double altAlleleFreq = 0.0;
String gtString = fields[9].split(":")[0];
if (gtString.equals("0/0")) {
gt = Genotype.REF_REF;
} else if (gtString.equals("0/1") || gtString.equals("1/0")) {
gt = Genotype.REF_ALT1;
} else if (gtString.equals("1/1")) {
gt = Genotype.ALT1_ALT1;
} else if (gtString.equals("1/2")) {
gt = Genotype.ALT1_ALT2;
} else {
gt = Genotype.UNK;
}
if ((gt == Genotype.REF_ALT1) && ((ref.length() > 1 && alt[0].length() == 1) || (ref.length() == 1 && alt[0].length() > 1))) {
//String ad = fields[9].split(":")[1];
int adIdx = -1;
if (fields[8].contains("DPI")) {
adIdx = 4;
} else {
adIdx = 5;
}
String ad = fields[9].split(":")[adIdx];
double ad1 = Integer.parseInt(ad.split(",")[0]);
double ad2 = Integer.parseInt(ad.split(",")[1]);
// double ad2 = Integer.parseInt(fields[9].split(":")[4]);
altAlleleFreq = ad2/(ad1+ad2);
// if (altAlleleFreq < .2) {
// return LocusGenotype.REFERENCE;
// }
} else if (gt == Genotype.ALT1_ALT1) {
altAlleleFreq = 1.0;
}
return new LocusGenotype(chr, pos, end, ref, alt[0], alt.length > 1 ? alt[1] : null, gt, qual, altAlleleFreq);
} catch (NumberFormatException e) {
e.printStackTrace();
throw new RuntimeException("Error processing: " + line);
}
}
private LocusGenotype parseGatkLine(String line) {
// if (line == null) {
// return LocusGenotype.REFERENCE;
// }
try {
String[] fields = line.split("\\s+");
String chr = fields[0];
int pos = Integer.parseInt(fields[1]);
int end = pos;
String ref = fields[3];
String[] alt = fields[4].split(",");
double qual = Double.parseDouble(fields[5]);
// String filter = fields[6];
// String info = fields[7];
// int endIdx = info.indexOf("END=");
// if (endIdx >= 0) {
// int endStopIdx = info.indexOf(";");
// end = Integer.parseInt(info.substring(endIdx+4, endStopIdx));
// }
Genotype gt = null;
double altAlleleFreq = 0.0;
String gtString = fields[9].split(":")[0];
if (gtString.equals("0/0")) {
gt = Genotype.REF_REF;
} else if (gtString.equals("0/1")) {
gt = Genotype.REF_ALT1;
} else if (gtString.equals("1/1")) {
gt = Genotype.ALT1_ALT1;
} else if (gtString.equals("1/2")) {
gt = Genotype.ALT1_ALT2;
} else {
gt = Genotype.UNK;
}
if ((gt == Genotype.REF_ALT1) && ((ref.length() > 1 && alt[0].length() == 1) || (ref.length() == 1 && alt[0].length() > 1))) {
String ad = fields[9].split(":")[1];
double depth = Integer.parseInt(fields[9].split(":")[2]);
double ad1 = Integer.parseInt(ad.split(",")[0]);
double ad2 = Integer.parseInt(ad.split(",")[1]);
// double ad2 = Integer.parseInt(fields[9].split(":")[4]);
altAlleleFreq = ad2/(ad1+ad2);
// if (altAlleleFreq < .2) {
// return LocusGenotype.REFERENCE;
// }
} else if (gt == Genotype.ALT1_ALT1) {
altAlleleFreq = 1.0;
}
return new LocusGenotype(chr, pos, end, ref, alt[0], alt.length > 1 ? alt[1] : null, gt, qual, altAlleleFreq);
} catch (NumberFormatException e) {
e.printStackTrace();
throw new RuntimeException("Error processing: " + line);
}
}
private LocusGenotype parseFreebayesLine(String line) {
// if (line == null) {
// return LocusGenotype.REFERENCE;
// }
try {
String[] fields = line.split("\\s+");
String chr = fields[0];
int pos = Integer.parseInt(fields[1]);
int end = pos;
String ref = fields[3];
String[] alt = fields[4].split(",");
double qual = Double.parseDouble(fields[5]);
// String filter = fields[6];
// String info = fields[7];
// int endIdx = info.indexOf("END=");
// if (endIdx >= 0) {
// int endStopIdx = info.indexOf(";");
// end = Integer.parseInt(info.substring(endIdx+4, endStopIdx));
// }
Genotype gt = null;
double altAlleleFreq = 0.0;
if (line.contains("APRIM")) {
String gtString = fields[9];
if (gtString.equals("0|0")) {
gt = Genotype.REF_REF;
} else if (gtString.equals("0|1")) {
gt = Genotype.REF_ALT1;
} else if (gtString.equals("1|0")) {
gt = Genotype.REF_ALT1;
} else if (gtString.equals("1|1")) {
gt = Genotype.ALT1_ALT1;
} else if (gtString.equals("1|2")) {
gt = Genotype.ALT1_ALT2;
} else if (gtString.equals("2|1")) {
gt = Genotype.ALT1_ALT2;
} else {
gt = Genotype.UNK;
}
altAlleleFreq = 1.0;
} else {
String gtString = fields[9].split(":")[0];
if (gtString.equals("0/0")) {
gt = Genotype.REF_REF;
} else if (gtString.equals("0/1") || gtString.equals("1/0")) {
gt = Genotype.REF_ALT1;
} else if (gtString.equals("1/1")) {
gt = Genotype.ALT1_ALT1;
} else if (gtString.equals("1/2")) {
gt = Genotype.ALT1_ALT2;
} else {
gt = Genotype.UNK;
}
// if ((gt == Genotype.REF_ALT1) && (fields[8].contains("DPI"))) {
if ((gt == Genotype.REF_ALT1) && ((ref.length() > 1 && alt[0].length() == 1) || (ref.length() == 1 && alt[0].length() > 1))) {
// String ad = fields[9].split(":")[4];
// double depth = Integer.parseInt(fields[9].split(":")[1]);
double ad1 = Integer.parseInt(fields[9].split(":")[3]);
double ad2 = Integer.parseInt(fields[9].split(":")[5]);
// String[] alleleDepth = ad.split(",");
// double ad1 = Integer.parseInt(alleleDepth[0]);
// double ad2 = Integer.parseInt(alleleDepth[1]);
altAlleleFreq = ad2/(ad1+ad2);
// if (altAlleleFreq < .2) {
// return LocusGenotype.REFERENCE;
// }
// altAlleleFreq = ad2/(depth);
// if (altAlleleFreq < .2) {
// gt = Genotype.REF_REF;
// }
} else if (gt == Genotype.ALT1_ALT1) {
altAlleleFreq = 1.0;
}
}
return new LocusGenotype(chr, pos, end, ref, alt[0], alt.length > 1 ? alt[1] : null, gt, qual, altAlleleFreq);
} catch (NumberFormatException e) {
e.printStackTrace();
throw new RuntimeException("Error processing: " + line);
}
}
private TrioGenotype getNextTrio() {
LocusGenotype father = null;
LocusGenotype mother = null;
LocusGenotype child = null;
try {
String fatherLine = null;
String motherLine = null;
String childLine = null;
if (cachedFather != null) {
father = cachedFather;
cachedFather = null;
} else {
fatherLine = this.father.readLine();
while (fatherLine != null && fatherLine.startsWith("#")) {
fatherLine = this.father.readLine();
}
}
if (cachedMother != null) {
mother = cachedMother;
cachedMother = null;
} else {
motherLine = this.mother.readLine();
while(motherLine != null && motherLine.startsWith("#")) {
motherLine = this.mother.readLine();
}
}
if (cachedChild != null) {
child = cachedChild;
cachedChild = null;
} else {
childLine = this.child.readLine();
while(childLine != null && childLine.startsWith("#")) {
childLine = this.child.readLine();
}
}
if (fatherLine == null && motherLine == null && childLine == null) {
// We're at the end of all 3 vcfs.
return null;
}
if (fatherLine != null) {
father = parseLine(fatherLine);
}
if (motherLine != null) {
mother = parseLine(motherLine);
}
if (childLine != null) {
child = parseLine(childLine);
}
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
// if (father.getPos() == 9996638) {
// System.out.println("here");
// }
// if (father.getPos() == 47555134) {
// System.out.println("here");
// }
List<Long> ends = new ArrayList<Long>();
if (father != null) {
ends.add(getEnd(father));
}
if (mother != null) {
ends.add(getEnd(mother));
}
if (child != null) {
ends.add(getEnd(child));
}
long minEnd = Long.MAX_VALUE;
for (long end : ends) {
if (end < minEnd) {
minEnd = end;
}
}
if (child != null && getEnd(child) > minEnd) {
cachedChild = child;
child = null;
}
if (mother != null && getEnd(mother) > minEnd) {
cachedMother = mother;
mother = null;
}
if (father != null && getEnd(father) > minEnd) {
cachedFather = father;
father = null;
}
/*
String minChromosome = father.getChromosome();
if (chromosomeOrder.get(mother.getChromosome()) < chromosomeOrder.get(minChromosome)) {
minChromosome = mother.getChromosome();
}
if (chromosomeOrder.get(child.getChromosome()) < chromosomeOrder.get(minChromosome)) {
minChromosome = child.getChromosome();
}
int minPos = Integer.MAX_VALUE;
if (father.getChromosome().equals(minChromosome)) {
if (father.getPos() < minPos) {
minPos = father.getPos();
}
}
if (mother.getChromosome().equals(minChromosome)) {
if (mother.getPos() < minPos) {
minPos = mother.getPos();
}
}
if (child.getChromosome().equals(minChromosome)) {
if (child.getPos() < minPos) {
minPos = child.getPos();
}
}
if (!father.getChromosome().equals(minChromosome) || father.getPos() != minPos) {
if (father != LocusGenotype.REFERENCE) {
cachedFather = father;
father = LocusGenotype.REFERENCE;
}
}
if (!mother.getChromosome().equals(minChromosome) || mother.getPos() != minPos) {
if (mother != LocusGenotype.REFERENCE) {
cachedMother = mother;
mother = LocusGenotype.REFERENCE;
}
}
if (!child.getChromosome().equals(minChromosome) || child.getPos() != minPos) {
if (child != LocusGenotype.REFERENCE) {
cachedChild = child;
child = LocusGenotype.REFERENCE;
}
}
*/
if (father == null) {
father = LocusGenotype.REFERENCE;
}
if (mother == null) {
mother = LocusGenotype.REFERENCE;
}
if (child == null) {
child = LocusGenotype.REFERENCE;
}
return new TrioGenotype(father, mother, child);
}
private long getEnd(LocusGenotype lg) {
return getChromosomeVal(lg.getChromosome()) + lg.getEnd();
}
public Iterator<TrioGenotype> iterator() {
return new TrioGenotypeIterator(this);
}
private boolean isInRegion(SAMFileHeader header, String chromosome, int start, int stop) {
// boolean isInRegion = currentRegion.overlapsRead(read);
while ((currentRegion != null) &&
(!currentRegion.overlaps(chromosome, start, stop)) &&
(!isRegionBeyondRead(currentRegion, chromosome, start, stop))) {
if (regionIter.hasNext()) {
currentRegion = (Feature) regionIter.next();
} else {
currentRegion = null;
}
}
return (currentRegion != null) && (currentRegion.overlaps(chromosome, start, stop));
}
private boolean isRegionBeyondRead(Feature region, String chromosome, int start, int stop) {
boolean isRegionBeyond = false;
// if (regionChrIdx > readChrIdx) {
// isRegionBeyond = true;
// } else if (regionChrIdx < readChrIdx) {
// isRegionBeyond = false;
// } else {
// isRegionBeyond = (region.getStart() > read.getAlignmentEnd());
// }
isRegionBeyond = (region.getStart() > stop);
return isRegionBeyond;
}
private static class TrioGenotypeIterator implements Iterator<TrioGenotype> {
private TrioVcfReader reader;
private TrioGenotype nextTrio = null;
TrioGenotypeIterator(TrioVcfReader reader) {
this.reader = reader;
}
@Override
public boolean hasNext() {
if (nextTrio == null) {
nextTrio = reader.getNextTrio();
}
return nextTrio != null;
}
@Override
public TrioGenotype next() {
TrioGenotype trio = nextTrio;
nextTrio = null;
return trio;
}
@Override
public void remove() {
throw new UnsupportedOperationException("Remove not supported for TrioGenotypeIterator.");
}
}
/*
public static void main(String[] args) throws Exception {
String chromosomes = "/home/lmose/dev/abra/trio/chromosomes.txt";
String father = "/home/lmose/dev/abra/trio/father.vcf";
String mother = "/home/lmose/dev/abra/trio/mother.vcf";
String child = "/home/lmose/dev/abra/trio/child.vcf";
TrioVcfReader rdr = new TrioVcfReader(chromosomes, father, mother, child);
int l = 0;
for (TrioGenotype gt : rdr) {
if (gt.hasVariant()) {
System.out.println(gt.summary());
}
// if (l++ > 500) break;
}
}
*/
}