package net.sf.cram;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.TreeMap;
import java.util.zip.GZIPInputStream;
import net.sf.cram.CramTools.LevelConverter;
import net.sf.cram.CramTools.ValidationStringencyConverter;
import net.sf.cram.index.BAMQueryFilteringIterator;
import net.sf.cram.index.CramIndex;
import net.sf.picard.io.IoUtil;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.util.Log;
import net.sf.picard.util.Log.LogLevel;
import net.sf.samtools.BAMFileWriter;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileHeader.SortOrder;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMIterator;
import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordIterator;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.SeekableFileStream;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.converters.FileConverter;
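/**
 * Merges CRAM and/or BAM files into a single BAM or SAM output. Inputs are
 * opened through their .bai or .crai index when a region is requested, the
 * headers are merged, read names are prefixed with the source id to avoid
 * collisions, and records are emitted in coordinate order.
 *
 * Example invocation (the jar name below is only illustrative and may differ
 * in your build):
 *
 * <pre>
 * java -cp cramtools.jar net.sf.cram.Merge -R human.fa -r chr1:65000-100000 \
 *     --output-file merged.bam input1.cram input2.bam
 * </pre>
 */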
public class Merge {
public static void usage(JCommander jc) {
StringBuilder sb = new StringBuilder();
sb.append("\n");
jc.usage(sb);
System.out.println("Version "
+ Merge.class.getPackage().getImplementationVersion());
System.out.println(sb.toString());
}
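    /**
     * Entry point: parses the command line, opens the inputs (optionally
     * restricted to one region), merges the headers and streams the merged
     * records to the chosen destination.
     */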
public static void main(String[] args) throws IOException {
Params params = new Params();
JCommander jc = new JCommander(params);
jc.parse(args);
Log.setGlobalLogLevel(params.logLevel);
if (args.length == 0 || params.help) {
usage(jc);
System.exit(1);
}
if (params.reference == null) {
System.out.println("A reference fasta file is required.");
System.exit(1);
}
if (params.files == null || params.files.isEmpty()) {
System.out.println("At least one CRAM or BAM file is required.");
System.exit(1);
}
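        // Set the "reference" system property and load the reference sequence
        // file; CRAM decoding needs the reference sequences.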
        System.setProperty("reference", params.reference.getAbsolutePath());
        ReferenceSequenceFile refFile = ReferenceSequenceFileFactory
                .getReferenceSequenceFile(params.reference);
AlignmentSliceQuery query = params.region == null ? null
: new AlignmentSliceQuery(params.region);
List<RecordSource> list = readFiles(params.files, refFile, query, params.validationLevel);
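        // Record the input paths in a header comment and make the per-source
        // ids unique so that prefixed read names cannot collide.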
        StringBuilder mergeComment = new StringBuilder("Merged from:");
for (RecordSource source : list) {
mergeComment.append(" ").append(source.path);
}
resolveCollisions(list);
SAMFileHeader header = mergeHeaders(list);
header.addComment(mergeComment.toString());
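        // Choose the destination: BAM or SAM written to a file, or streamed to
        // stdout.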
        SAMFileWriter writer = null;
        if (params.outFile != null) {
            if (!params.samFormat)
                writer = new SAMFileWriterFactory().makeBAMWriter(header, true,
                        params.outFile);
            else
                writer = new SAMFileWriterFactory().makeSAMWriter(header, true,
                        params.outFile);
        } else if (!params.samFormat) {
            // hack to write BAM format to stdout:
            File file = File.createTempFile("bam", null);
            file.deleteOnExit();
            BAMFileWriter bamWriter = new BAMFileWriter(System.out, file);
            header.setSortOrder(SortOrder.coordinate);
            bamWriter.setHeader(header);
            writer = bamWriter;
        } else {
            writer = Utils.createSAMTextWriter(null, System.out, header,
                    params.printSAMHeader);
        }
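        // Drain the merged, coordinate-ordered stream into the writer.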
MergedIterator mergedIterator = new MergedIterator(list, header);
while (mergedIterator.hasNext()) {
SAMRecord record = mergedIterator.next();
writer.addAlignment(record);
}
mergedIterator.close();
for (RecordSource source : list)
source.close();
// hack: BAMFileWriter may throw this when streaming to stdout, so
// silently drop the exception if streaming out BAM format:
try {
writer.close();
} catch (net.sf.samtools.util.RuntimeIOException e) {
if (params.samFormat
|| params.outFile != null
|| !e.getMessage()
.matches(
"Terminator block not found after closing BGZF file.*"))
throw e;
}
}
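    /**
     * Opens every input file and attaches the appropriate iterator: through
     * the .bai index, through the .crai index (CRAM) when a region query is
     * given, or a plain whole-file iterator otherwise.
     */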
    private static List<RecordSource> readFiles(List<File> files,
            ReferenceSequenceFile refFile, AlignmentSliceQuery query,
            ValidationStringency validationStringency) throws IOException {
List<RecordSource> sources = new ArrayList<Merge.RecordSource>(
files.size());
        SAMFileReader.setDefaultValidationStringency(validationStringency);
for (File file : files) {
IoUtil.assertFileIsReadable(file);
RecordSource source = new RecordSource();
sources.add(source);
source.id = file.getName();
source.path = file.getAbsolutePath();
File index = new File(file.getAbsolutePath() + ".bai");
if (index.exists()) {
SAMFileReader reader = new SAMFileReader(file, index);
source.reader = reader;
if (query == null)
source.it = reader.iterator();
else
source.it = reader.query(query.sequence, query.start,
query.end, true);
} else {
index = new File(file.getAbsolutePath() + ".crai");
if (index.exists()) {
SAMFileReader reader = new SAMFileReader(file);
source.reader = reader;
if (query == null)
source.it = reader.iterator();
else {
                        SeekableFileStream is = new SeekableFileStream(file);
                        FileInputStream fis = new FileInputStream(index);
                        GZIPInputStream gis = new GZIPInputStream(
                                new BufferedInputStream(fis));
                        BufferedInputStream bis = new BufferedInputStream(gis);
                        List<CramIndex.Entry> full = CramIndex.readIndex(bis);
                        List<CramIndex.Entry> entries = new LinkedList<CramIndex.Entry>();
                        SAMSequenceRecord sequence = reader.getFileHeader()
                                .getSequence(query.sequence);
                        if (sequence == null)
                            throw new RuntimeException("Sequence not found: "
                                    + query.sequence);
                        entries.addAll(CramIndex.find(full,
                                sequence.getSequenceIndex(), query.start,
                                query.end - query.start));
                        bis.close();
                        if (entries.isEmpty())
                            throw new RuntimeException(
                                    "No CRAM index entries found for the query.");
                        SAMIterator it = new SAMIterator(is, refFile);
                        is.seek(entries.get(0).containerStartOffset);
BAMQueryFilteringIterator bit = new BAMQueryFilteringIterator(
it, query.sequence, query.start, query.end,
BAMQueryFilteringIterator.QueryType.CONTAINED,
reader.getFileHeader());
source.it = bit;
}
} else {
SAMFileReader reader = new SAMFileReader(file);
source.reader = reader;
source.it = reader.iterator();
}
}
}
return sources;
}
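    /**
     * Disambiguates source ids: duplicated ids get their occurrence number
     * appended (all but the first occurrence); unique ids are left untouched.
     */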
    private static void resolveCollisions(List<RecordSource> list) {
        // count how many sources share each id:
        Map<String, Integer> idCountMap = new TreeMap<String, Integer>();
        for (RecordSource source : list) {
            if (idCountMap.containsKey(source.id))
                idCountMap.put(source.id, idCountMap.get(source.id) + 1);
            else
                idCountMap.put(source.id, 1);
        }
        // append the occurrence number to duplicated ids; the first occurrence
        // and unique ids keep their original name:
        for (int i = list.size() - 1; i >= 0; i--) {
            RecordSource source = list.get(i);
            int count = idCountMap.get(source.id);
            if (count > 1) {
                list.get(i).id = source.id + String.valueOf(count);
                idCountMap.put(source.id, --count);
            }
        }
    }
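    /**
     * One input file: its reader, its record iterator and the id used as the
     * read-name prefix in the merged output.
     */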
private static class RecordSource {
private String path;
private String id;
private CloseableIterator<SAMRecord> it;
private SAMFileReader reader;
public RecordSource() {
}
public RecordSource(String id, SAMRecordIterator it) {
this.id = id;
this.it = it;
}
public void close() {
if (it != null)
it.close();
if (reader != null)
reader.close();
}
}
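    /**
     * Builds a combined header holding the union of the sequence dictionaries,
     * program records, comments and read groups of all sources; on id clashes
     * the first occurrence wins.
     */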
private static SAMFileHeader mergeHeaders(List<RecordSource> sources) {
SAMFileHeader header = new SAMFileHeader();
for (RecordSource source : sources) {
SAMFileHeader h = source.reader.getFileHeader();
for (SAMSequenceRecord seq : h.getSequenceDictionary()
.getSequences()) {
if (header.getSequenceDictionary().getSequence(
seq.getSequenceName()) == null)
header.addSequence(seq);
}
for (SAMProgramRecord pro : h.getProgramRecords()) {
if (header.getProgramRecord(pro.getProgramGroupId()) == null)
header.addProgramRecord(pro);
}
for (String comment : h.getComments())
header.addComment(comment);
for (SAMReadGroupRecord rg : h.getReadGroups()) {
if (header.getReadGroup(rg.getReadGroupId()) == null)
header.addReadGroup(rg);
}
}
return header;
}
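    /**
     * Merges the per-source iterators into a single coordinate-ordered stream,
     * rewriting reference indices against the merged header and prefixing read
     * names with the source id.
     */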
private static class MergedIterator implements SAMRecordIterator {
private static String delim = ".";
private RecordSource[] sources;
private SAMRecord[] records;
private SAMRecord next;
private SAMFileHeader header;
public MergedIterator(List<RecordSource> list, SAMFileHeader header) {
this.header = header;
            sources = list.toArray(new RecordSource[list.size()]);
records = new SAMRecord[list.size()];
for (int i = 0; i < records.length; i++) {
if (sources[i].it.hasNext())
records[i] = sources[i].it.next();
}
advance();
}
@Override
public void close() {
if (sources != null)
for (RecordSource source : sources)
if (source != null)
source.close();
records = null;
next = null;
}
@Override
public boolean hasNext() {
return next != null;
}
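        /**
         * Picks the source whose current record comes first in coordinate
         * order, rewires its (mate) reference indices to the merged header,
         * prefixes the read name with the source id and pulls the next record
         * from that source.
         */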
private void advance() {
int candidateIndex = getIndexOfMinAlignment();
if (candidateIndex < 0) {
next = null;
} else {
next = records[candidateIndex];
SAMSequenceRecord sequence = header.getSequence(next
.getReferenceName());
next.setHeader(header);
next.setReferenceIndex(sequence.getSequenceIndex());
next.setReadName(sources[candidateIndex].id + delim
+ next.getReadName());
if (next.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
next.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
} else {
SAMSequenceRecord mateSequence = header.getSequence(next
.getMateReferenceName());
next.setMateReferenceIndex(mateSequence.getSequenceIndex());
}
if (sources[candidateIndex].it.hasNext())
records[candidateIndex] = sources[candidateIndex].it.next();
else
records[candidateIndex] = null;
}
}
        @Override
        public SAMRecord next() {
            if (next == null)
                throw new NoSuchElementException();
            SAMRecord result = next;
            advance();
            return result;
        }
        @Override
        public void remove() {
            throw new UnsupportedOperationException("Removal is not supported.");
        }
        @Override
        public SAMRecordIterator assertSorted(SortOrder sortOrder) {
            // The requested order is not verified here; advance() already
            // emits records in coordinate order.
            return this;
        }
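        /**
         * Returns the index of the source whose current record is smallest in
         * coordinate order (merged-header reference index, then alignment
         * start), or -1 when all sources are exhausted or only unplaced
         * records remain.
         */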
        private int getIndexOfMinAlignment() {
            if (records == null || records.length == 0)
                return -1;
            int minRefIndex = Integer.MAX_VALUE;
            int minStart = Integer.MAX_VALUE;
            int index = -1;
            for (int i = 0; i < records.length; i++) {
                if (records[i] == null)
                    continue;
                int start = records[i].getAlignmentStart();
                if (start < 1)
                    continue;
                SAMSequenceRecord sequence = header.getSequence(records[i]
                        .getReferenceName());
                if (sequence == null)
                    continue;
                int refIndex = sequence.getSequenceIndex();
                // coordinate order: merged-header reference index first, then
                // alignment start:
                if (refIndex < minRefIndex
                        || (refIndex == minRefIndex && start < minStart)) {
                    minRefIndex = refIndex;
                    minStart = start;
                    index = i;
                }
            }
            return index;
        }
}
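    /** Command line arguments, bound by jCommander. */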
    @Parameters(commandDescription = "Tool to merge CRAM or BAM files.")
static class Params {
@Parameter(names = { "-l", "--log-level" }, description = "Change log level: DEBUG, INFO, WARNING, ERROR.", converter = LevelConverter.class)
LogLevel logLevel = LogLevel.ERROR;
@Parameter(names = { "-v", "--validation-level" }, description = "Change validation stringency level: STRICT, LENIENT, SILENT.", converter = ValidationStringencyConverter.class)
ValidationStringency validationLevel = ValidationStringency.DEFAULT_STRINGENCY;
@Parameter(names = { "--reference-fasta-file", "-R" }, converter = FileConverter.class, description = "Path to the reference fasta file, it must be uncompressed and indexed (use 'samtools faidx' for example).")
File reference;
@Parameter(names = { "--output-file" }, converter = FileConverter.class, description = "Path to the output BAM file. Omit for stdout.")
File outFile;
@Parameter(names = { "--sam-format" }, description = "Output in SAM rather than BAM format.")
boolean samFormat = false;
@Parameter(names = { "--sam-header" }, description = "Print SAM file header when output format is text SAM.")
boolean printSAMHeader = false;
@Parameter(names = { "--region", "-r" }, description = "Alignment slice specification, for example: chr1:65000-100000.")
String region;
@Parameter(names = { "-h", "--help" }, description = "Print help and quit")
boolean help = false;
    @Parameter(converter = FileConverter.class, description = "The paths to the CRAM or BAM files to merge.")
List<File> files;
}
}