Package org.molgenis.util.plink.readers

Source Code of org.molgenis.util.plink.readers.BedBimFamReader

package org.molgenis.util.plink.readers;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.molgenis.util.plink.datatypes.Biallele;
import org.molgenis.util.plink.datatypes.BimEntry;
import org.molgenis.util.plink.datatypes.FamEntry;
import org.molgenis.util.plink.drivers.BedFileDriver;
import org.molgenis.util.plink.drivers.BimFileDriver;
import org.molgenis.util.plink.drivers.FamFileDriver;

/**
* Plink binary reader/converter. See:
* https://github.com/molgenis/molgenis/blob/standalone_tools
* /src/plinkbintocsv/PlinkbinToCsv.java
*
* @author joeri
*
*/
public class BedBimFamReader
{

  private BedFileDriver bedfd;
  private BimFileDriver bimfd;
  private FamFileDriver famfd;

  private long nrOfIndividuals;
  private long nrOfSnps;
  private long nrOfGenotypes;
  private int paddingPerSnp;
  private List<String> individualNames;
  private List<String> snpNames;
  private HashMap<String, Biallele> snpCoding;

  public BedBimFamReader(File bed, File bim, File fam) throws Exception
  {
    bedfd = new BedFileDriver(bed);
    bimfd = new BimFileDriver(bim);
    famfd = new FamFileDriver(fam);

    nrOfIndividuals = famfd.getNrOfElements();
    nrOfSnps = bimfd.getNrOfElements();
    nrOfGenotypes = nrOfIndividuals * nrOfSnps;
    paddingPerSnp = (int) ((bedfd.getNrOfElements() - nrOfGenotypes) / nrOfSnps);
  }

  private void setIndividuals() throws Exception
  {
    List<String> individualNames = new ArrayList<String>();
    List<FamEntry> famEntries = famfd.getAllEntries();
    if (famEntries.size() != nrOfIndividuals)
    {
      throw new Exception("Problem with FAM file: scanned number of elements (" + nrOfIndividuals
          + ") does not match number of parsed elements (" + famEntries.size() + ")");
    }
    for (FamEntry fe : famEntries)
    {
      if (individualNames.contains(fe.getIndividual()))
      {
        throw new Exception("Problem with FAM file: Individual '" + fe.getIndividual() + "' is not unique!");
      }
      individualNames.add(fe.getIndividual());
    }
    this.individualNames = individualNames;
  }

  private void setSnps() throws Exception
  {
    nrOfSnps = bimfd.getNrOfElements();
    List<String> snpNames = new ArrayList<String>();
    List<BimEntry> bimEntries = bimfd.getAllEntries();
    if (bimEntries.size() != nrOfSnps)
    {
      throw new Exception(
          "Problem with BIM file: scanned number of elements does not match number of parsed elements");
    }
    snpCoding = new HashMap<String, Biallele>();
    for (BimEntry be : bimEntries)
    {
      if (snpCoding.containsKey(be.getSNP()))
      {
        throw new Exception("Problem with BIM file: SNP '" + be.getSNP() + "' is not unique!");
      }
      snpCoding.put(be.getSNP(), be.getBiallele());
      snpNames.add(be.getSNP());
    }
    this.snpNames = snpNames;
  }

  public void extractGenotypes(File writeTo) throws Exception
  {
    setIndividuals();
    setSnps();

    Writer genotypesOut = new OutputStreamWriter(new FileOutputStream(writeTo), "UTF-8");

    // /header: all individual names
    for (String indvName : individualNames)
    {
      genotypesOut.write("\t" + indvName);
    }
    genotypesOut.write("\n");

    // elements: snp name + genotypes
    int snpCounter = 0;
    for (int genotypeStart = 0; genotypeStart < nrOfGenotypes; genotypeStart += nrOfIndividuals)
    {
      System.out.println((int) ((genotypeStart / (double) nrOfGenotypes) * 100) + "% of genotypes done");

      String snpName = snpNames.get(snpCounter);
      String[] allIndividualsForThisSNP = bedfd.getElements(genotypeStart, genotypeStart + nrOfIndividuals,
          paddingPerSnp, snpCounter);
      String a1 = Character.toString(snpCoding.get(snpName).getAllele1());
      String a2 = Character.toString(snpCoding.get(snpName).getAllele2());
      String hom1 = a1 + a1;
      String hom2 = a2 + a2;
      String hetr = a1 + a2;

      StringBuilder lineOfGenotypesBuilder = new StringBuilder(snpName);
      for (String s : allIndividualsForThisSNP)
      {
        lineOfGenotypesBuilder.append('\t').append(bedfd.convertGenoCoding(s, hom1, hom2, hetr, ""));
      }
      lineOfGenotypesBuilder.append('\n');
      genotypesOut.write(lineOfGenotypesBuilder.toString());

      snpCounter++;
    }

    genotypesOut.close();

  }

  public static void main(String[] args) throws Exception
  {
    File bed = new File(Biallele.class.getResource("../testfiles/test.bed").getFile());

    File bim = new File(Biallele.class.getResource("../testfiles/test.bim").getFile());

    File fam = new File(Biallele.class.getResource("../testfiles/test.fam").getFile());

    BedBimFamReader bbfr = new BedBimFamReader(bed, bim, fam);

    File out = new File("geno_tmp.txt");

    System.out.println("going to write to: " + out.getAbsolutePath());

    bbfr.extractGenotypes(out);
  }
}
TOP

Related Classes of org.molgenis.util.plink.readers.BedBimFamReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.