if (DEBUG) {
System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
}
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
// if the read extends beyond the downstream (right) end of the reference window, clip it
if ( mustClipDownstream(read, refWindowStop) )
read = ReadClipper.hardClipByReadCoordinates(read, refWindowStop - read.getSoftStart() + 1, read.getReadLength() - 1);
// if the read extends beyond the upstream (left) end of the reference window, clip it
if ( mustClipUpstream(read, refWindowStart) )
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, refWindowStart);
if (read.isEmpty())
continue;
// hard-clip low quality ends - this may introduce extra H elements in CIGAR string
read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD );
if (read.isEmpty())
continue;
// get bases of candidate haplotypes that overlap with reads
final long readStart = read.getSoftStart();
final long readEnd = read.getSoftEnd();
// see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
// but they're actually consistent with the insertion!
// Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
// Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
final long eventStartPos = ref.getLocus().getStart();
// compute total number of clipped bases (soft or hard clipped) and only use them if necessary
final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength);
final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0;
final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ;
final byte [] unclippedReadBases = read.getReadBases();
final byte [] unclippedReadQuals = read.getBaseQualities();
/**
* Compute genomic locations that candidate haplotypes will span.
* Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord,
* adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above.
* We will propose haplotypes that overlap the read with some padding.
* True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read)
* Last term is because if a read starts with an insertion then these bases are not accounted for in readStart.
* trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to
* differentiate context between two haplotypes
*/
final int absEventLength = Math.abs(eventLength);
long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read) - absEventLength, 0);
long stopLocationInRefForHaplotypes = readEnd - numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read) + absEventLength;
if (DEBUG)
System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) {
startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
}
else if (startLocationInRefForHaplotypes > ref.getWindow().getStop()) {
startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely;
}
// candidate haplotype cannot go beyond reference context
if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context
}
if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) {
stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read legnth, but we want to compute likelihoods in the whole region that a read might overlap
}
// ok, we now figured out the total number of clipped bases on both ends.
// Figure out where we want to place the haplotype to score read against
if (DEBUG)
System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
// LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();
/**
* Check if we'll end up with an empty read once all clipping is done
*/
if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) {
int j=0;
for (Allele a: haplotypeMap.keySet()) {
perReadAlleleLikelihoodMap.add(p,a,0.0);
readLikelihoods[readIdx][j++] = 0.0;
}
}
else {
final int endOfCopy = unclippedReadBases.length - numEndSoftClippedBases;
final byte[] readBases = Arrays.copyOfRange(unclippedReadBases, numStartSoftClippedBases, endOfCopy);
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals, numStartSoftClippedBases, endOfCopy);
int j=0;
final byte[] contextLogGapOpenProbabilities = new byte[readBases.length];
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
// get homopolymer length profile for current haplotype
final int[] hrunProfile = new int[readBases.length];
getContextHomopolymerLength(readBases,hrunProfile);
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
// get the base insertion and deletion qualities to use
final byte[] baseInsertionQualities, baseDeletionQualities;
if ( read.hasBaseIndelQualities() ) {
baseInsertionQualities = Arrays.copyOfRange(read.getBaseInsertionQualities(), numStartSoftClippedBases, endOfCopy);
baseDeletionQualities = Arrays.copyOfRange(read.getBaseDeletionQualities(), numStartSoftClippedBases, endOfCopy);
} else {
baseInsertionQualities = contextLogGapOpenProbabilities;
baseDeletionQualities = contextLogGapOpenProbabilities;
}
// Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM
final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities);
// Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM
final Map<GATKSAMRecord,byte[]> readGCPArrayMap = Collections.singletonMap(processedRead,contextLogGapContinuationProbabilities);
// Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations