if (!REMOVE_ALIGNMENT_INFORMATION) {
outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
outHeader.setProgramRecords(inHeader.getProgramRecords());
}
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, presorted, OUTPUT);
////////////////////////////////////////////////////////////////////////////
// Build a sorting collection to use if we are sanitizing
////////////////////////////////////////////////////////////////////////////
final SortingCollection<SAMRecord> sorter;
if (sanitizing) {
sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
}
else {
sorter = null;
}
final ProgressLogger progress = new ProgressLogger(log, 1000000, "Reverted");
for (final SAMRecord rec : in) {
// Weed out non-primary and supplemental read as we don't want duplicates in the reverted file!
if (rec.isSecondaryOrSupplementary()) continue;
// Actually to the reverting of the remaining records
revertSamRecord(rec);
if (sanitizing) sorter.add(rec);
else out.addAlignment(rec);
progress.record(rec);
}
////////////////////////////////////////////////////////////////////////////
// Now if we're sanitizing, clean up the records and write them to the output
////////////////////////////////////////////////////////////////////////////
if (!sanitizing) {
out.close();
}
else {
long total = 0, discarded = 0;
final PeekableIterator<SAMRecord> iterator = new PeekableIterator<SAMRecord>(sorter.iterator());
final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<SAMReadGroupRecord, FastqQualityFormat>();
// Figure out the quality score encoding scheme for each read group.
for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) {
final SamReader reader = SamReaderFactory.makeDefault().validationStringency(VALIDATION_STRINGENCY).open(INPUT);
final SamRecordFilter filter = new SamRecordFilter() {
public boolean filterOut(final SAMRecord rec) {
return !rec.getReadGroup().getId().equals(rg.getId());
}
public boolean filterOut(final SAMRecord first, final SAMRecord second) {
throw new UnsupportedOperationException();
}
};
readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringIterator(reader.iterator(), filter), RESTORE_ORIGINAL_QUALITIES));
CloserUtil.close(reader);
}
for(final SAMReadGroupRecord r : readGroupToFormat.keySet()) {
log.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r));
}
if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) {
log.error("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa);
return -1;
}
final ProgressLogger sanitizerProgress = new ProgressLogger(log, 1000000, "Sanitized");
readNameLoop: while (iterator.hasNext()) {
final List<SAMRecord> recs = fetchByReadName(iterator);
total += recs.size();
// Check that all the reads have bases and qualities of the same length
for (final SAMRecord rec : recs) {
if (rec.getReadBases().length != rec.getBaseQualities().length) {
log.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
discarded += recs.size();
continue readNameLoop;
}
}
// Check that if the first read is marked as unpaired that there is in fact only one read
if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
log.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
discarded += recs.size();
continue readNameLoop;
}
// Check that if we have paired reads there is exactly one first of pair and one second of pair
if (recs.get(0).getReadPairedFlag()) {
int firsts=0, seconds=0, unpaired=0;
for (final SAMRecord rec : recs) {
if (!rec.getReadPairedFlag()) ++unpaired;
if (rec.getFirstOfPairFlag()) ++firsts;
if (rec.getSecondOfPairFlag()) ++seconds;
}
if (unpaired > 0 || firsts != 1 || seconds != 1) {
log.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
discarded += recs.size();
continue readNameLoop;
}
}
// If we've made it this far spit the records into the output!
for (final SAMRecord rec : recs) {
// The only valid quality score encoding scheme is standard; if it's not standard, change it.
final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
if (!recordFormat.equals(FastqQualityFormat.Standard)) {
final byte quals[] = rec.getBaseQualities();
for (int i = 0; i < quals.length; i++) {
quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
}
rec.setBaseQualities(quals);
}
out.addAlignment(rec);
sanitizerProgress.record(rec);
}
}
out.close();
final double discardRate = discarded / (double) total;
final NumberFormat fmt = new DecimalFormat("0.000%");
log.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output.");