Package abra.utils.trio

Source Code of abra.utils.trio.TrioVcfReader

/* Copyright 2013 University of North Carolina at Chapel Hill.  All rights reserved. */
package abra.utils.trio;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

import abra.Feature;
import abra.RegionLoader;
import abra.utils.trio.LocusGenotype.Genotype;

import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMRecord;

/**
* Reader class for SAM or BAM file containing paired reads.
* Provides the ability to iterate over all mated pairs in the file.
* Supports multiple mappings for the same read. (i.e. repeating read id's)
* SAM versus BAM format is determined from the file suffix.  i.e. ".sam" or ".bam"
*
* @author Lisle Mose (lmose at unc dot edu)
*/
public class TrioVcfReader implements Iterable<TrioGenotype> {
   
    private LocusGenotype cachedFather;
    private LocusGenotype cachedMother;
    private LocusGenotype cachedChild;
   
   
    private SAMRecord lastRead;
    private int lineCnt = 0;
   
    private BufferedReader father;
    private BufferedReader mother;
    private BufferedReader child;
   
    private Map<String, Long> chromosomeOrder;
   
    static final String NO_CHROMOSOME = "";
   
  private Feature currentRegion;
  private Iterator<Feature> regionIter;
 
  public enum Caller { FREEBAYES, GATK, ISAAC };
 
  private Caller caller;
   
    public TrioVcfReader(String chromosomeFile, String father, String mother, String child, Caller caller) throws IOException {
     
      this.caller = caller;
//      GtfLoader loader = new GtfLoader();
//    List<Feature> regions = loader.load(regionsGtf);
//     
//        regionIter = regions.iterator();
//        if (regionIter.hasNext()) {
//          currentRegion = regionIter.next();
//        }
     
      initChromosomeOrder(chromosomeFile);
     
    //GZIPInputStream fatherGzip = new GZIPInputStream(new FileInputStream(father));
    //GZIPInputStream motherGzip = new GZIPInputStream(new FileInputStream(mother));
    //GZIPInputStream childGzip = new GZIPInputStream(new FileInputStream(child));
     
    //this.father = new BufferedReader(new InputStreamReader(fatherGzip));
//    this.mother = new BufferedReader(new InputStreamReader(motherGzip));
//    this.child = new BufferedReader(new InputStreamReader(childGzip));
   
   
   
//    String l = this.father.readLine();
//    while (l != null) {
//      System.out.println(l);
//      l = this.father.readLine();
//    }
   
      this.father = new BufferedReader(new FileReader(father));
      this.mother = new BufferedReader(new FileReader(mother));
      this.child = new BufferedReader(new FileReader(child));
    }
   
    private void initChromosomeOrder(String chromosomeFile) throws IOException {
     
      chromosomeOrder = new HashMap<String, Long>();
     
      BufferedReader reader = new BufferedReader(new FileReader(chromosomeFile));

      long order = 0;
      String chromosome = reader.readLine();
     
      while (chromosome != null) {
        chromosome = chromosome.trim();
        chromosomeOrder.put(chromosome, order);
        order += 1000000000;
        chromosome = reader.readLine();
      }
     
      chromosomeOrder.put(NO_CHROMOSOME, order);
     
      reader.close();
    }
   
    private long getChromosomeVal(String chr) {
      return chromosomeOrder.get(chr);
    }
   
    public void close() throws IOException {
        father.close();
        mother.close();
        child.close();
    }
   
    private LocusGenotype parseLine(String line) {
      LocusGenotype lgt = null;
     
      if (caller == Caller.FREEBAYES) {
        lgt = parseFreebayesLine(line);
      } else if (caller == Caller.GATK) {
        lgt = parseGatkLine(line);
      } else if (caller == Caller.ISAAC) {
        lgt = parseIsaacLine(line);
      } else {
        throw new IllegalArgumentException("Invalid caller specified...");
      }
     
      return lgt;
    }
   
    private LocusGenotype parseIsaacLine(String line) {
//      if (line == null) {
//        return LocusGenotype.REFERENCE;
//      }
      try {
        String[] fields = line.split("\\s+");
       
        String chr = fields[0];
        int pos = Integer.parseInt(fields[1]);
        int end = pos;
       
        String ref = fields[3];
        String[] alt = fields[4].split(",");
        double qual = Double.parseDouble(fields[5]);
//        String filter = fields[6];
//        String info = fields[7];
       
//        int endIdx = info.indexOf("END=");
//        if (endIdx >= 0) {
//          int endStopIdx = info.indexOf(";");
//          end = Integer.parseInt(info.substring(endIdx+4, endStopIdx));
//        }
       
        Genotype gt = null;
        double altAlleleFreq = 0.0;
       
       
        String gtString = fields[9].split(":")[0];
       
        if (gtString.equals("0/0")) {
        gt = Genotype.REF_REF;
      } else if (gtString.equals("0/1") || gtString.equals("1/0")) {
        gt = Genotype.REF_ALT1;
      } else if (gtString.equals("1/1")) {
        gt = Genotype.ALT1_ALT1;
      } else if (gtString.equals("1/2")) {
        gt = Genotype.ALT1_ALT2;
      } else {     
        gt = Genotype.UNK;
      }
       
        if ((gt == Genotype.REF_ALT1) && ((ref.length() > 1 && alt[0].length() == 1) || (ref.length() == 1 && alt[0].length() > 1))) {
          //String ad = fields[9].split(":")[1];
          int adIdx = -1;
          if (fields[8].contains("DPI")) {
            adIdx = 4;
          } else {
            adIdx = 5;
          }
          String ad = fields[9].split(":")[adIdx];
          double ad1 = Integer.parseInt(ad.split(",")[0]);
          double ad2 = Integer.parseInt(ad.split(",")[1]);
//          double ad2 = Integer.parseInt(fields[9].split(":")[4]);
          altAlleleFreq = ad2/(ad1+ad2);
         
//          if (altAlleleFreq < .2) {
//            return LocusGenotype.REFERENCE;
//          }
         
        } else if (gt == Genotype.ALT1_ALT1) {
          altAlleleFreq = 1.0;
        }
       
        return new LocusGenotype(chr, pos, end, ref, alt[0], alt.length > 1 ? alt[1] : null, gt, qual, altAlleleFreq);
      } catch (NumberFormatException e) {
        e.printStackTrace();
        throw new RuntimeException("Error processing: " + line);
      }
    }
   
    private LocusGenotype parseGatkLine(String line) {
//      if (line == null) {
//        return LocusGenotype.REFERENCE;
//      }
      try {
        String[] fields = line.split("\\s+");
       
        String chr = fields[0];
        int pos = Integer.parseInt(fields[1]);
        int end = pos;
       
        String ref = fields[3];
        String[] alt = fields[4].split(",");
        double qual = Double.parseDouble(fields[5]);
//        String filter = fields[6];
//        String info = fields[7];
       
//        int endIdx = info.indexOf("END=");
//        if (endIdx >= 0) {
//          int endStopIdx = info.indexOf(";");
//          end = Integer.parseInt(info.substring(endIdx+4, endStopIdx));
//        }
       
        Genotype gt = null;
        double altAlleleFreq = 0.0;
       
       
        String gtString = fields[9].split(":")[0];
       
        if (gtString.equals("0/0")) {
        gt = Genotype.REF_REF;
      } else if (gtString.equals("0/1")) {
        gt = Genotype.REF_ALT1;
      } else if (gtString.equals("1/1")) {
        gt = Genotype.ALT1_ALT1;
      } else if (gtString.equals("1/2")) {
        gt = Genotype.ALT1_ALT2;
      } else {     
        gt = Genotype.UNK;
      }
       
        if ((gt == Genotype.REF_ALT1) && ((ref.length() > 1 && alt[0].length() == 1) || (ref.length() == 1 && alt[0].length() > 1))) {
          String ad = fields[9].split(":")[1];
          double depth = Integer.parseInt(fields[9].split(":")[2]);
          double ad1 = Integer.parseInt(ad.split(",")[0]);
          double ad2 = Integer.parseInt(ad.split(",")[1]);
//          double ad2 = Integer.parseInt(fields[9].split(":")[4]);
          altAlleleFreq = ad2/(ad1+ad2);
         
//          if (altAlleleFreq < .2) {
//            return LocusGenotype.REFERENCE;
//          }
         
        } else if (gt == Genotype.ALT1_ALT1) {
          altAlleleFreq = 1.0;
        }
       
        return new LocusGenotype(chr, pos, end, ref, alt[0], alt.length > 1 ? alt[1] : null, gt, qual, altAlleleFreq);
      } catch (NumberFormatException e) {
        e.printStackTrace();
        throw new RuntimeException("Error processing: " + line);
      }
    }

   
    private LocusGenotype parseFreebayesLine(String line) {
//      if (line == null) {
//        return LocusGenotype.REFERENCE;
//      }
      try {
        String[] fields = line.split("\\s+");
       
        String chr = fields[0];
        int pos = Integer.parseInt(fields[1]);
        int end = pos;
       
        String ref = fields[3];
        String[] alt = fields[4].split(",");
        double qual = Double.parseDouble(fields[5]);
//        String filter = fields[6];
//        String info = fields[7];
       
//        int endIdx = info.indexOf("END=");
//        if (endIdx >= 0) {
//          int endStopIdx = info.indexOf(";");
//          end = Integer.parseInt(info.substring(endIdx+4, endStopIdx));
//        }
       
        Genotype gt = null;
        double altAlleleFreq = 0.0;
       
        if (line.contains("APRIM")) {
          String gtString = fields[9];
          if (gtString.equals("0|0")) {
          gt = Genotype.REF_REF;
        } else if (gtString.equals("0|1")) {
          gt = Genotype.REF_ALT1;
        } else if (gtString.equals("1|0")) {
          gt = Genotype.REF_ALT1;
        } else if (gtString.equals("1|1")) {
          gt = Genotype.ALT1_ALT1;
        } else if (gtString.equals("1|2")) {
          gt = Genotype.ALT1_ALT2;
        } else if (gtString.equals("2|1")) {
          gt = Genotype.ALT1_ALT2;
        } else {     
          gt = Genotype.UNK;
        }
         
          altAlleleFreq = 1.0;
        } else {
       
          String gtString = fields[9].split(":")[0];
         
         
         
          if (gtString.equals("0/0")) {
          gt = Genotype.REF_REF;
        } else if (gtString.equals("0/1") || gtString.equals("1/0")) {
          gt = Genotype.REF_ALT1;
        } else if (gtString.equals("1/1")) {
          gt = Genotype.ALT1_ALT1;
        } else if (gtString.equals("1/2")) {
          gt = Genotype.ALT1_ALT2;
        } else {     
          gt = Genotype.UNK;
        }
         
         
         
  //        if ((gt == Genotype.REF_ALT1) && (fields[8].contains("DPI"))) {
          if ((gt == Genotype.REF_ALT1) && ((ref.length() > 1 && alt[0].length() == 1) || (ref.length() == 1 && alt[0].length() > 1))) {
//            String ad = fields[9].split(":")[4];
//            double depth = Integer.parseInt(fields[9].split(":")[1]);
            double ad1 = Integer.parseInt(fields[9].split(":")[3]);
            double ad2 = Integer.parseInt(fields[9].split(":")[5]);
  //          String[] alleleDepth = ad.split(",");
  //          double ad1 = Integer.parseInt(alleleDepth[0]);
  //          double ad2 = Integer.parseInt(alleleDepth[1]);
            altAlleleFreq = ad2/(ad1+ad2);
           
//            if (altAlleleFreq < .2) {
//              return LocusGenotype.REFERENCE;
//            }

//            altAlleleFreq = ad2/(depth);
           
  //          if (altAlleleFreq < .2) {
  //            gt = Genotype.REF_REF;
  //          }
          } else if (gt == Genotype.ALT1_ALT1) {
            altAlleleFreq = 1.0;
          }
        }
       
        return new LocusGenotype(chr, pos, end, ref, alt[0], alt.length > 1 ? alt[1] : null, gt, qual, altAlleleFreq);
      } catch (NumberFormatException e) {
        e.printStackTrace();
        throw new RuntimeException("Error processing: " + line);
      }
    }
   
    private TrioGenotype getNextTrio()  {
      LocusGenotype father = null;
      LocusGenotype mother = null;
      LocusGenotype child  = null;
     
      try {
        String fatherLine = null;
        String motherLine = null;
        String childLine = null;
       
        if (cachedFather != null) {
          father = cachedFather;
          cachedFather = null;
        } else {
          fatherLine = this.father.readLine();
          while (fatherLine != null && fatherLine.startsWith("#")) {
            fatherLine = this.father.readLine();
          }
        }
       
        if (cachedMother != null) {
          mother = cachedMother;
          cachedMother = null;
        } else {
          motherLine = this.mother.readLine();
          while(motherLine != null && motherLine.startsWith("#")) {
            motherLine = this.mother.readLine();
          }
        }
       
        if (cachedChild != null) {
          child = cachedChild;
          cachedChild = null;
        } else {
          childLine = this.child.readLine();
          while(childLine != null && childLine.startsWith("#")) {
            childLine = this.child.readLine();
          }
        }
       
          if (fatherLine == null && motherLine == null && childLine == null) {
            // We're at the end of all 3 vcfs.
            return null;
          }
       
          if (fatherLine != null) {
            father = parseLine(fatherLine);
          }

          if (motherLine != null) {
            mother = parseLine(motherLine);
          }
         
          if (childLine != null) {
            child = parseLine(childLine);
          }

      } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
      }
     
//      if (father.getPos() == 9996638) {
//        System.out.println("here");
//      }
     

//      if (father.getPos() == 47555134) {
//        System.out.println("here");
//      }
     
      List<Long> ends = new ArrayList<Long>();
      if (father != null) {
        ends.add(getEnd(father));
      }
      if (mother != null) {
        ends.add(getEnd(mother));
      }
      if (child != null) {
        ends.add(getEnd(child));
      }
     
      long minEnd = Long.MAX_VALUE;
      for (long end : ends) {
        if (end < minEnd) {
          minEnd = end;
        }
      }
     
      if (child != null && getEnd(child) > minEnd) {
        cachedChild = child;
        child = null;
      }
     
      if (mother != null && getEnd(mother) > minEnd) {
        cachedMother = mother;
        mother = null;
      }
     
      if (father != null && getEnd(father) > minEnd) {
        cachedFather = father;
        father = null;
      }
     
      /*
      String minChromosome = father.getChromosome();
      if (chromosomeOrder.get(mother.getChromosome()) < chromosomeOrder.get(minChromosome)) {
        minChromosome = mother.getChromosome();
      }
      if (chromosomeOrder.get(child.getChromosome()) < chromosomeOrder.get(minChromosome)) {
        minChromosome = child.getChromosome();
      }
     
      int minPos = Integer.MAX_VALUE;
      if (father.getChromosome().equals(minChromosome)) {
        if (father.getPos() < minPos) {
          minPos = father.getPos();
        }
      }
     
      if (mother.getChromosome().equals(minChromosome)) {
        if (mother.getPos() < minPos) {
          minPos = mother.getPos();
        }
      }
     
      if (child.getChromosome().equals(minChromosome)) {
        if (child.getPos() < minPos) {
          minPos = child.getPos();
        }
      }
     
      if (!father.getChromosome().equals(minChromosome) || father.getPos() != minPos) {
        if (father != LocusGenotype.REFERENCE) {
          cachedFather = father;
          father = LocusGenotype.REFERENCE;
        }
      }
     
      if (!mother.getChromosome().equals(minChromosome) || mother.getPos() != minPos) {
        if (mother != LocusGenotype.REFERENCE) {
          cachedMother = mother;
          mother = LocusGenotype.REFERENCE;
        }
      }
     
      if (!child.getChromosome().equals(minChromosome) || child.getPos() != minPos) {
        if (child != LocusGenotype.REFERENCE) {
          cachedChild = child;
          child = LocusGenotype.REFERENCE;
        }
      }
      */
     
      if (father == null) {
        father = LocusGenotype.REFERENCE;
      }
     
      if (mother == null) {
        mother = LocusGenotype.REFERENCE;
      }
     
      if (child == null) {
        child = LocusGenotype.REFERENCE;
      }
     
      return new TrioGenotype(father, mother, child);
    }
  
    private long getEnd(LocusGenotype lg) {
      return getChromosomeVal(lg.getChromosome()) + lg.getEnd();
    }

    public Iterator<TrioGenotype> iterator() {
        return new TrioGenotypeIterator(this);
    }
  
  private boolean isInRegion(SAMFileHeader header, String chromosome, int start, int stop) {
//    boolean isInRegion = currentRegion.overlapsRead(read);
   
    while ((currentRegion != null) &&
         (!currentRegion.overlaps(chromosome, start, stop)) &&
         (!isRegionBeyondRead(currentRegion, chromosome, start, stop))) {
     
      if (regionIter.hasNext()) {
        currentRegion = (Feature) regionIter.next();
      } else {
        currentRegion = null;
      }
    }
   
    return (currentRegion != null) && (currentRegion.overlaps(chromosome, start, stop));
  }
 
  private boolean isRegionBeyondRead(Feature region, String chromosome, int start, int stop) {
    boolean isRegionBeyond = false;

   
//    if (regionChrIdx > readChrIdx) {
//      isRegionBeyond = true;
//    } else if (regionChrIdx < readChrIdx) {
//      isRegionBeyond = false;
//    } else {
//      isRegionBeyond = (region.getStart() > read.getAlignmentEnd());
//    }
   
    isRegionBeyond = (region.getStart() > stop);
   
    return isRegionBeyond;
  }
   
   
    private static class TrioGenotypeIterator implements Iterator<TrioGenotype> {

        private TrioVcfReader reader;
        private TrioGenotype nextTrio = null;
       
        TrioGenotypeIterator(TrioVcfReader reader) {
            this.reader = reader;
        }
       
        @Override
        public boolean hasNext() {
            if (nextTrio == null) {
              nextTrio = reader.getNextTrio();
            }
           
            return nextTrio != null;
        }

        @Override
        public TrioGenotype next() {
          TrioGenotype trio = nextTrio;
          nextTrio = null;
            return trio;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("Remove not supported for TrioGenotypeIterator.");
        }
    }

    /*
    public static void main(String[] args) throws Exception {
      String chromosomes = "/home/lmose/dev/abra/trio/chromosomes.txt";
      String father = "/home/lmose/dev/abra/trio/father.vcf";
      String mother = "/home/lmose/dev/abra/trio/mother.vcf";
      String child = "/home/lmose/dev/abra/trio/child.vcf";
     
     
      TrioVcfReader rdr = new TrioVcfReader(chromosomes, father, mother, child);
     
      int l = 0;
      for (TrioGenotype gt : rdr) {
        if (gt.hasVariant()) {
          System.out.println(gt.summary());
        }
       
//        if (l++ > 500) break;
      }
    }
    */
TOP

Related Classes of abra.utils.trio.TrioVcfReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.