feature.setChr(tokens[0]);
feature.setStart(Integer.parseInt(tokens[1]));
if(tokens[2].length() != 1) {
throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line);
}
feature.setRef(tokens[2].charAt(0));
switch (count) {
case basicTokenCount:
bases = tokens[4];
quals = tokens[5];
// parsing is pretty straightforward for 6-col format
if ( feature.getRef() == '*' ) { // this indicates an indel -- but it shouldn't occur with vanilla 6-col format
throw new CodecLineParsingException("Found an indel on line = " + line + " but it shouldn't happen in simple pileup format");
} else {
parseBasesAndQuals(feature, bases, quals);
feature.setRefBases(tokens[2].toUpperCase());
feature.setEnd(feature.getStart());
}
break;
case consensusSNPTokenCount: // pileup called a SNP or a reference base
observedString = tokens[3].toUpperCase();
feature.setFWDAlleles(new ArrayList<String>(2));
feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
feature.setVariantConfidence(Double.parseDouble(tokens[5]));
bases = tokens[8];
quals = tokens[9];
// confirm that we have a non-variant, not a mis-parsed indel
if ( feature.getRef() == '*' ) {
throw new CodecLineParsingException("Line parsing of " + line + " says we have a SNP or non-variant but the ref base is '*', which indicates an indel");
}
// Parse the SNP or non-variant
parseBasesAndQuals(feature, bases, quals);
if ( observedString.length() != 1 ) {
throw new CodecLineParsingException( "Line parsing of " + line + " says we have a SNP or non-variant but the genotype token is not a single letter: " + observedString);
}
feature.setRefBases(tokens[2].toUpperCase());
feature.setEnd(feature.getStart());
char ch = observedString.charAt(0);
switch ( ch ) { // record alleles (decompose ambiguous base codes)
case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
}
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
else {
// we know that at least one allele is non-ref;
// if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
// homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
feature.setVariantType(VariantType.SNP);
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
) feature.setNumNonRef(1);
else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
}
break;
case consensusIndelTokenCount:
observedString = tokens[3].toUpperCase();
feature.setFWDAlleles(new ArrayList<String>(2));
feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
feature.setVariantConfidence(Double.parseDouble(tokens[5]));
// confirm that we have an indel, not a mis-parsed SNP or non-variant
if ( feature.getRef() != '*' ) {
throw new CodecLineParsingException("Line parsing of " + line + " says we have an indel but the ref base is not '*'");
}
// Parse the indel
parseIndels(observedString,feature) ;
if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this has got to be an insertion; otherwise the state is inconsistent!!!!
break;
default:
throw new CodecLineParsingException("The SAM pileup line didn't have the expected number of tokens " +
"(expected = " + basicTokenCount + " (basic pileup), " + consensusSNPTokenCount +
" (consensus pileup for a SNP or non-variant site) or " + consensusIndelTokenCount +
" (consensus pileup for an indel); saw = " + count + " on line = " + line + ")");
}
return feature;