Package net.sf.cram

Source Code of net.sf.cram.Merge

package net.sf.cram;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.zip.GZIPInputStream;

import net.sf.cram.CramTools.LevelConverter;
import net.sf.cram.CramTools.ValidationStringencyConverter;
import net.sf.cram.index.BAMQueryFilteringIterator;
import net.sf.cram.index.CramIndex;
import net.sf.picard.io.IoUtil;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.util.Log;
import net.sf.picard.util.Log.LogLevel;
import net.sf.samtools.BAMFileWriter;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileHeader.SortOrder;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMIterator;
import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordIterator;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.SeekableFileStream;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.converters.FileConverter;

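/**
 * Command-line tool that merges several CRAM and/or BAM files into a single
 * SAM or BAM output, emitting records in order of alignment start. Read names
 * are prefixed with the name of their source file, and the merge can be
 * restricted to an alignment slice (for example chr1:65000-100000).
 */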
public class Merge {

  public static void usage(JCommander jc) {
    StringBuilder sb = new StringBuilder();
    sb.append("\n");
    jc.usage(sb);

    System.out.println("Version "
        + Merge.class.getPackage().getImplementationVersion());
    System.out.println(sb.toString());
  }

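  /**
   * Parses command-line arguments, opens all input files (optionally
   * restricted to a region), merges their headers, and streams the merged
   * records to the requested BAM or SAM output (a file or stdout).
   */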
  public static void main(String[] args) throws IOException {
    Params params = new Params();
    JCommander jc = new JCommander(params);
    jc.parse(args);

    Log.setGlobalLogLevel(params.logLevel);

    if (args.length == 0 || params.help) {
      usage(jc);
      System.exit(1);
    }

    if (params.reference == null) {
      System.out.println("A reference fasta file is required.");
      System.exit(1);
    }

    if (params.files == null || params.files.isEmpty()) {
      System.out.println("At least one CRAM or BAM file is required.");
      System.exit(1);
    }

    ReferenceSequenceFile refFile = null;
    if (params.reference != null) {
      System.setProperty("reference", params.reference.getAbsolutePath());
      refFile = ReferenceSequenceFileFactory
          .getReferenceSequenceFile(params.reference);
    } else {
      String prop = System.getProperty("reference");
      if (prop != null)
        refFile = ReferenceSequenceFileFactory
            .getReferenceSequenceFile(new File(prop));
    }

    AlignmentSliceQuery query = params.region == null ? null
        : new AlignmentSliceQuery(params.region);

    List<RecordSource> list = readFiles(params.files, refFile, query, params.validationLevel);

    StringBuffer mergeComment = new StringBuffer("Merged from:");
    for (RecordSource source : list) {
      mergeComment.append(" ").append(source.path);
    }

    resolveCollisions(list);
    SAMFileHeader header = mergeHeaders(list);
    header.addComment(mergeComment.toString());

    SAMFileWriter writer = null;
    if (params.outFile != null) {
      if (!params.samFormat)
        writer = new SAMFileWriterFactory().makeBAMWriter(header, true,
            params.outFile);
      else
        writer = new SAMFileWriterFactory().makeSAMWriter(header, true,
            params.outFile);
    } else if (!params.samFormat) {
      // hack to write BAM format to stdout:
      File file = File.createTempFile("bam", null);
      file.deleteOnExit();
      BAMFileWriter bamWriter = new BAMFileWriter(System.out, file);
      header.setSortOrder(SortOrder.coordinate);
      bamWriter.setHeader(header);
      writer = bamWriter;
    } else {
      writer = Utils.createSAMTextWriter(null, System.out, header,
          params.printSAMHeader);
    }

    MergedIterator mergedIterator = new MergedIterator(list, header);
    while (mergedIterator.hasNext()) {
      SAMRecord record = mergedIterator.next();
      writer.addAlignment(record);
    }

    mergedIterator.close();
    for (RecordSource source : list)
      source.close();

    // hack: BAMFileWriter may throw this when streaming to stdout, so
    // silently drop the exception if streaming out BAM format:
    try {
      writer.close();
    } catch (net.sf.samtools.util.RuntimeIOException e) {
      if (params.samFormat
          || params.outFile != null
          || !e.getMessage()
              .matches(
                  "Terminator block not found after closing BGZF file.*"))
        throw e;
    }
  }

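  /**
   * Opens each input file and prepares an iterator over its records. A BAM
   * file with a .bai index or a CRAM file with a .crai index can be queried
   * for the requested region; otherwise the whole file is iterated.
   */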
  private static List<RecordSource> readFiles(List<File> files,
      ReferenceSequenceFile refFile, AlignmentSliceQuery query, ValidationStringency validationStringency)
      throws IOException {
    List<RecordSource> sources = new ArrayList<Merge.RecordSource>(
        files.size());

    SAMFileReader.setDefaultValidationStringency(validationStringency);
    for (File file : files) {
      IoUtil.assertFileIsReadable(file);

      RecordSource source = new RecordSource();
      sources.add(source);
      source.id = file.getName();
      source.path = file.getAbsolutePath();

      File index = new File(file.getAbsolutePath() + ".bai");
      if (index.exists()) {
        SAMFileReader reader = new SAMFileReader(file, index);
        source.reader = reader;
        if (query == null)
          source.it = reader.iterator();
        else
          source.it = reader.query(query.sequence, query.start,
              query.end, true);
      } else {
        index = new File(file.getAbsolutePath() + ".crai");
        if (index.exists()) {
          SAMFileReader reader = new SAMFileReader(file);
          source.reader = reader;
          if (query == null)
            source.it = reader.iterator();
          else {
            SeekableFileStream is = new SeekableFileStream(file);

            FileInputStream fis = new FileInputStream(index);
            GZIPInputStream gis = new GZIPInputStream(
                new BufferedInputStream(fis));
            BufferedInputStream bis = new BufferedInputStream(gis);
            List<CramIndex.Entry> full = CramIndex.readIndex(bis);

            List<CramIndex.Entry> entries = new LinkedList<CramIndex.Entry>();
            SAMSequenceRecord sequence = reader.getFileHeader()
                .getSequence(query.sequence);
            if (sequence == null)
              throw new RuntimeException("Sequence not found: "
                  + query.sequence);

            entries.addAll(CramIndex.find(full,
                sequence.getSequenceIndex(), query.start,
                query.end - query.start));

            bis.close();

            if (entries.isEmpty())
              throw new RuntimeException("No index entries found for: "
                  + query.sequence + ":" + query.start + "-" + query.end);

            SAMIterator it = new SAMIterator(is, refFile);
            is.seek(entries.get(0).containerStartOffset);
            BAMQueryFilteringIterator bit = new BAMQueryFilteringIterator(
                it, query.sequence, query.start, query.end,
                BAMQueryFilteringIterator.QueryType.CONTAINED,
                reader.getFileHeader());
            source.it = bit;
          }
        } else {
          SAMFileReader reader = new SAMFileReader(file);
          source.reader = reader;
          source.it = reader.iterator();
        }
      }
    }

    return sources;
  }

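  /**
   * Disambiguates duplicate source ids (input files with the same name) by
   * appending an occurrence number, so that read-name prefixes stay unique.
   * The returned list is never populated and is ignored by the caller.
   */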
  private static List<String> resolveCollisions(List<RecordSource> list) {

    ArrayList<String> result = new ArrayList<String>(list.size());

    // count list entries:
    Map<String, Integer> idCountMap = new TreeMap<String, Integer>();
    for (RecordSource source : list) {
      if (idCountMap.containsKey(source.id))
        idCountMap.put(source.id, idCountMap.get(source.id) + 1);
      else
        idCountMap.put(source.id, 1);
    }

    // update entries with their number of occurrence except for singletons:
    for (int i = list.size() - 1; i >= 0; i--) {
      RecordSource source = list.get(i);
      int count = idCountMap.get(source.id);
      if (count > 1) {
        list.get(i).id = source.id + String.valueOf(count);
        idCountMap.put(source.id, --count);
      }
    }

    return result;
  }

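  /**
   * A single input: its path, the id used as a read-name prefix, its record
   * iterator and the underlying reader.
   */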
  private static class RecordSource {
    private String path;
    private String id;
    private CloseableIterator<SAMRecord> it;
    private SAMFileReader reader;

    public RecordSource() {
    }

    public RecordSource(String id, SAMRecordIterator it) {
      this.id = id;
      this.it = it;
    }

    public void close() {
      if (it != null)
        it.close();
      if (reader != null)
        reader.close();
    }

  }

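  /**
   * Builds a combined header by copying sequences, program records and read
   * groups from every input header (skipping duplicates) and concatenating
   * all header comments.
   */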
  private static SAMFileHeader mergeHeaders(List<RecordSource> sources) {
    SAMFileHeader header = new SAMFileHeader();
    for (RecordSource source : sources) {
      SAMFileHeader h = source.reader.getFileHeader();

      for (SAMSequenceRecord seq : h.getSequenceDictionary()
          .getSequences()) {
        if (header.getSequenceDictionary().getSequence(
            seq.getSequenceName()) == null)
          header.addSequence(seq);
      }

      for (SAMProgramRecord pro : h.getProgramRecords()) {
        if (header.getProgramRecord(pro.getProgramGroupId()) == null)
          header.addProgramRecord(pro);
      }

      for (String comment : h.getComments())
        header.addComment(comment);

      for (SAMReadGroupRecord rg : h.getReadGroups()) {
        if (header.getReadGroup(rg.getReadGroupId()) == null)
          header.addReadGroup(rg);
      }

    }
    return header;
  }

  // private static class SAMQueue implements BlockingQueue<SAMRecord> {
  // }

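  /**
   * N-way merge over the per-file iterators: repeatedly emits the record with
   * the smallest alignment start, with its read name prefixed by the id of
   * the file it came from.
   */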
  private static class MergedIterator implements SAMRecordIterator {
    private static String delim = ".";
    private RecordSource[] sources;
    private SAMRecord[] records;
    private SAMRecord next;
    private SAMFileHeader header;

    public MergedIterator(List<RecordSource> list, SAMFileHeader header) {
      this.header = header;
      sources = list.toArray(new RecordSource[list.size()]);
      records = new SAMRecord[list.size()];

      for (int i = 0; i < records.length; i++) {
        if (sources[i].it.hasNext())
          records[i] = sources[i].it.next();
      }

      advance();
    }

    @Override
    public void close() {
      if (sources != null)
        for (RecordSource source : sources)
          if (source != null)
            source.close();

      records = null;
      next = null;
    }

    @Override
    public boolean hasNext() {
      return next != null;
    }

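    /**
     * Picks the record with the smallest alignment start among the current
     * heads of all sources, rewrites its reference indices against the merged
     * header, prefixes its read name with the source id, and pulls the next
     * record from that source.
     */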
    private void advance() {
      int candidateIndex = getIndexOfMinAlignment();
      if (candidateIndex < 0) {
        next = null;
      } else {
        next = records[candidateIndex];
        SAMSequenceRecord sequence = header.getSequence(next
            .getReferenceName());

        next.setHeader(header);

        next.setReferenceIndex(sequence.getSequenceIndex());

        next.setReadName(sources[candidateIndex].id + delim
            + next.getReadName());

        if (next.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
          next.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
        } else {
          SAMSequenceRecord mateSequence = header.getSequence(next
              .getMateReferenceName());
          next.setMateReferenceIndex(mateSequence.getSequenceIndex());
        }

        if (sources[candidateIndex].it.hasNext())
          records[candidateIndex] = sources[candidateIndex].it.next();
        else
          records[candidateIndex] = null;
      }
    }

    @Override
    public SAMRecord next() {
      if (next == null)
        return null;

      SAMRecord result = next;
      advance();

      return result;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException("remove() is not supported.");
    }

    @Override
    public SAMRecordIterator assertSorted(SortOrder sortOrder) {
      // TODO Auto-generated method stub
      return null;
    }

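    /**
     * Returns the index of the source whose current record has the smallest
     * positive alignment start, or -1 if no such record remains.
     */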
    private int getIndexOfMinAlignment() {
      if (records == null || records.length == 0)
        return -1;

      int min = Integer.MAX_VALUE;
      int index = -1;
      for (int i = 0; i < records.length; i++) {
        if (records[i] == null)
          continue;

        int start = records[i].getAlignmentStart();
        if (start > 0 && start < min) {
          min = start;
          index = i;
        }
      }
      return index;
    }

  }

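  /** Command-line parameters for the merge tool, parsed by JCommander. */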
  @Parameters(commandDescription = "Tool to merge CRAM or BAM files. ")
  static class Params {
    @Parameter(names = { "-l", "--log-level" }, description = "Change log level: DEBUG, INFO, WARNING, ERROR.", converter = LevelConverter.class)
    LogLevel logLevel = LogLevel.ERROR;

    @Parameter(names = { "-v", "--validation-level" }, description = "Change validation stringency level: STRICT, LENIENT, SILENT.", converter = ValidationStringencyConverter.class)
    ValidationStringency validationLevel = ValidationStringency.DEFAULT_STRINGENCY;

    @Parameter(names = { "--reference-fasta-file", "-R" }, converter = FileConverter.class, description = "Path to the reference fasta file, it must be uncompressed and indexed (use 'samtools faidx' for example).")
    File reference;

    @Parameter(names = { "--output-file" }, converter = FileConverter.class, description = "Path to the output BAM file. Omit for stdout.")
    File outFile;

    @Parameter(names = { "--sam-format" }, description = "Output in SAM rather than BAM format.")
    boolean samFormat = false;

    @Parameter(names = { "--sam-header" }, description = "Print SAM file header when output format is text SAM.")
    boolean printSAMHeader = false;

    @Parameter(names = { "--region", "-r" }, description = "Alignment slice specification, for example: chr1:65000-100000.")
    String region;

    @Parameter(names = { "-h", "--help" }, description = "Print help and quit")
    boolean help = false;

    @Parameter(converter = FileConverter.class, description = "The paths to the CRAM or BAM files to merge.")
    List<File> files;

  }

}
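
For reference, the snippet below is a minimal sketch of driving the tool programmatically through its main method. It assumes the cramtools jar and its dependencies are on the classpath; the MergeExample class and all file paths are placeholders, and the option names are taken from the Params class above.

import net.sf.cram.Merge;

// Hypothetical driver class; all file paths below are placeholders.
public class MergeExample {
  public static void main(String[] args) throws Exception {
    Merge.main(new String[] {
        "--reference-fasta-file", "ref.fa", // uncompressed, indexed fasta
        "--output-file", "merged.bam",      // omit to stream BAM to stdout
        "--region", "chr1:65000-100000",    // optional alignment slice
        "a.cram", "b.bam" });               // input CRAM/BAM files
  }
}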