// Single pass over the input records. For every mapped, primary record a fragment entry is added to
// fragSort; for every such record that is paired with a mapped mate, the two ends are matched up through
// the temporary map below and the completed pair entry is added to pairSort.
final SamHeaderAndIterator headerAndIterator = openInputs();
final SAMFileHeader header = headerAndIterator.header;
// Holds the first-seen end of each pair until its mate shows up. Entries are stored under a
// (reference index, key) pair and removed again when the mate is read.
// NOTE(review): judging by the class name and sizeInRam() this map presumably spills to disk, with
// MAX_FILE_HANDLES_FOR_READ_ENDS_MAP bounding the open files -- confirm against the map implementation.
final ReadEndsForMarkDuplicatesMap tmp = new DiskBasedReadEndsForMarkDuplicatesMap(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP);
// Zero-based position of the current record in iteration order; stored in the read ends as the index-in-file.
long index = 0;
final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read");
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
// Create the library id generator from the input header unless one has already been set.
if (null == this.libraryIdGenerator) {
this.libraryIdGenerator = new LibraryIdGenerator(header);
}
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
// This doesn't have anything to do with building sorted ReadEnd lists, but it can be done in the same pass
// over the input
if (PROGRAM_RECORD_ID != null) {
// Gather all PG IDs seen in merged input files in first pass. These are gathered for two reasons:
// - to know how many different PG records to create to represent this program invocation.
// - to know what PG IDs are already used to avoid collisions when creating new ones.
// Note that if there are one or more records that do not have a PG tag, then a null value
// will be stored in this set.
pgIdsSeen.add(rec.getStringAttribute(SAMTag.PG.name()));
}
if (rec.getReadUnmappedFlag()) {
if (rec.getReferenceIndex() == -1) {
// When we hit the unmapped reads with no coordinate, no reason to continue.
// Note that breaking here skips the ++index below, so this record and everything after it are
// not included in the final "Read N records" count. Its PG id has already been collected above.
break;
}
// If this read is unmapped but sorted with the mapped reads, just skip it.
} else if (!rec.isSecondaryOrSupplementary()) {
// Mapped, primary record: it always contributes a fragment entry, whether or not it is paired.
final ReadEndsForMarkDuplicates fragmentEnd = buildReadEnds(header, index, rec);
this.fragSort.add(fragmentEnd);
// Pair handling only applies when the mate is mapped too.
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
// Pair key is "<read group id>:<read name>". If the read-group attribute is null, string
// concatenation makes the key start with the literal "null".
final String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName();
// Look up (and remove) the mate's entry under this record's own reference index; the first end
// was stored under its read2ReferenceIndex below.
// NOTE(review): this relies on buildReadEnds filling read2ReferenceIndex with the mate's
// reference index for paired reads -- confirm.
ReadEndsForMarkDuplicates pairedEnds = tmp.remove(rec.getReferenceIndex(), key);
// See if we've already seen the first end or not
if (pairedEnds == null) {
// First end of the pair. A separate object is built instead of reusing fragmentEnd, because
// fragmentEnd has already been handed to fragSort while this one is mutated when the mate arrives.
pairedEnds = buildReadEnds(header, index, rec);
tmp.put(pairedEnds.read2ReferenceIndex, key, pairedEnds);
} else {
// Second end of the pair: position of the current record, as computed for its fragment entry.
final int sequence = fragmentEnd.read1ReferenceIndex;
final int coordinate = fragmentEnd.read1Coordinate;
// Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
// before updating the orientation later.
// At this point pairedEnds.orientation still describes only the stored (first-seen) end, and
// (orientation == ReadEnds.R) is used as that end's negative-strand flag.
if (rec.getFirstOfPairFlag()) {
pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R);
} else {
pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag());
}
// If the second read is actually later, just add the second read data, else flip the reads
// (on an exact tie in reference and coordinate the stored end stays read1)
if (sequence > pairedEnds.read1ReferenceIndex ||
(sequence == pairedEnds.read1ReferenceIndex && coordinate >= pairedEnds.read1Coordinate)) {
pairedEnds.read2ReferenceIndex = sequence;
pairedEnds.read2Coordinate = coordinate;
pairedEnds.read2IndexInFile = index;
// Pair orientation: stored end first, current record second.
pairedEnds.orientation = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R,
rec.getReadNegativeStrandFlag());
} else {
// The current record sorts before the stored end: move the stored end into the read2 slots,
// then put the current record into the read1 slots.
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
pairedEnds.read1ReferenceIndex = sequence;
pairedEnds.read1Coordinate = coordinate;
pairedEnds.read1IndexInFile = index;
// Pair orientation: current record first, stored end second.
pairedEnds.orientation = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(),
pairedEnds.orientation == ReadEnds.R);
}
// Add this record's score to whatever the stored entry already carries.
// NOTE(review): presumably buildReadEnds seeded score with the first end's score -- confirm.
pairedEnds.score += DuplicateScoringStrategy.computeDuplicateScore(rec, this.DUPLICATE_SCORING_STRATEGY);
this.pairSort.add(pairedEnds);
}
}
}
// Print out some stats every 1m reads
++index;
if (progress.record(rec)) {
log.info("Tracking " + tmp.size() + " as yet unmatched pairs. " + tmp.sizeInRam() + " records in RAM.");
}
}
// Whatever is still left in tmp at this point is a first end whose mate was never encountered.
log.info("Read " + index + " records. " + tmp.size() + " pairs never matched.");