/*
* The MIT License
*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.util;
import htsjdk.samtools.ReservedTagConstants;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import picard.util.IlluminaUtil.IlluminaAdapterPair;
import java.util.HashMap;
import java.util.Map;
/**
*
*/
public class ClippingUtilityTest {
@Test(dataProvider="clipTestData")
public void testBasicClip(final String testName, final String read, final String clip, final int minMatch, final double errRate, final int expected) {
final byte[] r = (read == null) ? null : StringUtil.stringToBytes(read);
final byte[] c = (clip == null) ? null : StringUtil.stringToBytes(clip);
final int result = ClippingUtility.findIndexOfClipSequence(r, c, minMatch, errRate);
Assert.assertEquals(result, expected, testName);
}
@Test(dataProvider = "clipTestData")
public void testSingleEndSamRecordClip(final String testName, final String read, final String clip, final int minMatch,
final double errRate, final int expected) {
if (read == null) return; // Silly case
final SingleEndAdapter adapter = new SingleEndAdapter(testName, clip);
for (final boolean reverse : new boolean[]{false, true}) {
final SAMRecord rec = new SAMRecord(null);
if (reverse) {
rec.setReadString(SequenceUtil.reverseComplement(read));
rec.setReadNegativeStrandFlag(true);
} else {
rec.setReadString(read);
}
final AdapterPair matchedAdapter = ClippingUtility.adapterTrimIlluminaSingleRead(rec, minMatch, errRate, adapter);
if (expected == -1) {
Assert.assertNull(matchedAdapter, testName);
Assert.assertNull(rec.getAttribute(ReservedTagConstants.XT), testName);
} else {
Assert.assertEquals(matchedAdapter, adapter, testName);
Assert.assertEquals(rec.getAttribute(ReservedTagConstants.XT), expected + 1, testName);
// Test that if minMatch is decreased, it still matches
for (int i = 1; i < minMatch - 3; ++i) {
Assert.assertEquals(ClippingUtility.adapterTrimIlluminaSingleRead(rec, minMatch - i, errRate, adapter), adapter, testName);
}
// Skip this test for high error rates, because almost anything will match.
if (errRate < 0.5) {
// Test that if minMatch is increased, it now fails to match
Assert.assertNull(ClippingUtility.adapterTrimIlluminaSingleRead(rec, minMatch + 1, errRate, adapter), testName);
// Test that if errRate is increased, it still matches
Assert.assertNotNull(ClippingUtility.adapterTrimIlluminaSingleRead(rec, minMatch, errRate + 0.1, adapter), testName);
}
// Very low error threshold should cause match failure
Assert.assertNull(ClippingUtility.adapterTrimIlluminaSingleRead(rec, minMatch, 0.01, adapter), testName);
}
}
}
@Test(dataProvider="clipPairedTestData")
public void testPairedEndClip(final String testName, final String read1, final String read2, final AdapterPair expected) {
final SAMRecord rec1 = new SAMRecord(new SAMFileHeader());
rec1.setReadString(read1);
rec1.setFirstOfPairFlag(true);
final SAMRecord rec2 = new SAMRecord(new SAMFileHeader());
rec2.setReadString(read2);
rec2.setSecondOfPairFlag(true);
final AdapterPair result = ClippingUtility.adapterTrimIlluminaPairedReads(rec1, rec2,
IlluminaAdapterPair.INDEXED, IlluminaAdapterPair.PAIRED_END);
if (result != null) {
Assert.assertEquals(result.getName(), expected.getName(), testName);
}
else {
Assert.assertNull(expected, testName);
}
}
@Test
public void testOneSidedMatchSupersededByTwoSidedMatch() {
final int readLength = 36;
final int adapterLength = 30;
final String read1 = patchAdapterSubsequenceIntoRead(makeBogusReadString(readLength), IlluminaAdapterPair.SINGLE_END.get3PrimeAdapterInReadOrder(), adapterLength);
final String read2 = patchAdapterSubsequenceIntoRead(makeBogusReadString(readLength), IlluminaAdapterPair.SINGLE_END.get5PrimeAdapterInReadOrder(), adapterLength);
final SAMRecord rec1 = new SAMRecord(null); rec1.setReadString(read1);
final SAMRecord rec2 = new SAMRecord(null); rec2.setReadString(read2);
AdapterPair result = ClippingUtility.adapterTrimIlluminaPairedReads(rec1, rec2, adapterLength - 2, 0.1,
IlluminaAdapterPair.PAIRED_END, IlluminaAdapterPair.SINGLE_END);
Assert.assertEquals(result, IlluminaAdapterPair.SINGLE_END);
// Confirm that without SINGLE_END, PAIRED_END would one-sided match, if match length is short enough.
result = ClippingUtility.adapterTrimIlluminaPairedReads(rec1, rec2, adapterLength/2, 0.1, IlluminaAdapterPair.PAIRED_END);
Assert.assertEquals(result, IlluminaAdapterPair.PAIRED_END);
// However, without shorter match length, one-sided match will fail.
Assert.assertNull(ClippingUtility.adapterTrimIlluminaPairedReads(rec1, rec2, adapterLength - 2, 0.1, IlluminaAdapterPair.PAIRED_END));
}
@Test(dataProvider = "testAdapterInAllReadPositionsDataProvider")
public void testAdapterInAllReadPositions(final int readLength) {
final int minAdapterLength = 6;
for (final IlluminaAdapterPair adapterPair : IlluminaAdapterPair.values()) {
final AdapterMarker marker = new AdapterMarker(adapterPair);
for (int adapterPosition = 0; adapterPosition < readLength; ++adapterPosition) {
final SAMRecord rec = createSamRecordWithAdapterSequence(readLength, adapterPair, adapterPosition);
final AdapterPair matchedAdapter = ClippingUtility.adapterTrimIlluminaSingleRead(rec, minAdapterLength, 0.1, adapterPair);
final Object xt = rec.getAttribute(ReservedTagConstants.XT);
rec.setAttribute(ReservedTagConstants.XT, null);
final AdapterPair truncatedAdapter = marker.adapterTrimIlluminaSingleRead(rec, minAdapterLength, 0.1);
final Object xtFromMarker = rec.getAttribute(ReservedTagConstants.XT);
if (adapterPosition <= readLength - minAdapterLength) {
Assert.assertEquals(matchedAdapter, adapterPair, "Failed at position " + adapterPosition);
Assert.assertEquals((Integer)xt, Integer.valueOf(adapterPosition + 1));
Assert.assertNotNull(truncatedAdapter);
Assert.assertEquals((Integer)xtFromMarker, Integer.valueOf(adapterPosition + 1));
} else {
Assert.assertNull(matchedAdapter);
Assert.assertNull(xt);
Assert.assertNull(truncatedAdapter);
Assert.assertNull(xtFromMarker);
}
}
}
}
private SAMRecord createSamRecordWithAdapterSequence(final int readLength, final IlluminaAdapterPair adapterPair, final int adapterPosition) {
final String adapterString = adapterPair.get3PrimeAdapterInReadOrder();
final int replacementLength = Math.min(adapterString.length(), readLength - adapterPosition);
final String adapterSubstring = adapterString.substring(0, replacementLength);
final String readBases = replaceSubstring(makeBogusReadString(readLength), adapterSubstring, adapterPosition, adapterSubstring.length());
final SAMRecord rec = new SAMRecord(null);
rec.setReadString(readBases);
return rec;
}
@DataProvider(name="testAdapterInAllReadPositionsDataProvider")
public Object[][] testAdapterInAllReadPositionsDataProvider() {
return new Object[][]{{100}, {36}};
}
@DataProvider(name="clipTestData")
public Object[][] getClipTestData() {
final String FORWARD = IlluminaAdapterPair.PAIRED_END.get3PrimeAdapter();
final String SE_FORWARD = IlluminaAdapterPair.SINGLE_END.get3PrimeAdapter();
final String REVERSE = IlluminaAdapterPair.PAIRED_END.get5PrimeAdapterInReadOrder();
return new Object[][] {
{"Simple test 1", "AAAAACCCCCAGATCGGAAGAGCA", "AGATCGGAAGAGCG", 14, 0.15, 10},
{"Simple test 2", "AAAAACCCCCGGGGGAGATCGGAAGAGCA", "AGATCGGAAGAGCG", 14, 0.15, 15},
{"No adapter", "AAAAACCCCCGGGGGTTTTT", "AGATCGGAAGAGCG", 6, 0.15, -1},
{"Partial adapter", "AAAAACCCCCTGATCGGAA", "AGATCGGAAGAGCG", 9, 0.15, 10},
// no longer support clips in middle of read
// {"Adapter+Primer", "AAAAACCCCCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG", "AGATCGGAAGAGCG", 6, 0.15, 10},
{"No sequence", null, "AGATCGGAAGAGCG", 6, 0.15, -1},
{"Read too short", "AGATCGGAAG", "AGATCGGAAGAGCG", 11, 0.15, -1},
{"All No Calls", "AAACCCNNNNNNNNNNNNNNNNNNNN", "AGATCGGAAGAGCG", 6, 0.15, -1},
{"From Test Data1", "CGGCATTCCTGCTGAACCGAGATCGGAAGAGCGTCGTGTAGGGAAAGGGGGTGGATCTCGGTGGGCGGCGTGTTGT", REVERSE, 57, 0.15, 19},
{"From Test Data2a", "CGGAAGAGCGGTTCAGCAGGAATGCCGAGATCCGAA", REVERSE, 9, 0.14, 27}, // XT:i:28
{"From Test Data2b", "CGGAAGAGCGGTTCAGCAGGAATGCCGAGATCGGAA", REVERSE, 10, 0.14, -1}, // only matches 9
{"From PE Test Data1", "CGGTTCAGCAGGAATGCCGAGATCGGAAGAGCGGGT", FORWARD, 17, 0.14, 19},
{"From PE Test Data2", "CGGTTCAGCAGGAATGCCGAGATCGGAAGAGCGGGT", REVERSE, 5, 0.14, -1},
new Object[] {"From Test 8-clipped", "TGGGGTGGTTATTGTTGATTTTTGTTTGTGTGTTAGGTTGTTTGTGTTAGTTTTTTATTTTATTTTCGAGATCGAA", FORWARD, 8, 0.14, 68},
{"50% can be bad", "AAAAACCCCCAGGTCGGAAGAGCG", "AGATCGGAAGAGCG", 5, 0.5, 18}, // 18?
{"From 30E54AAXX.5.a", "ATATCTGAAGATCTCGTATGCCGTCTTCTGCTTG", "AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG", 34, 0.14, 0}, // 0!? From KT test case
{"From 30E54AAXX.5.b", "ATATCTGAAGATCTCGTATGCCGTCTTCTGCTTG", SE_FORWARD, 34, 0.14, 0}, // 1?? From KT test case
};
}
@DataProvider(name="clipPairedTestData")
public Object[][] getClipPairedTestData() {
return new Object[][] {
{"Basic positive paired test matching", "CTACTGGCGCTGAAACTGAGCAGCCAAGCAGATCGG", "GCTTGGCTGCTCAGTTTCAGCGCCAGTAGAGATCGG", IlluminaAdapterPair.INDEXED},
// This matches on first end and matches at higher stringency on that one side
{"Basic positive first end one-sided test matching", "CTACTGGCGCTGAAAAGATCGGAAGAGCGGTTCAGC", "AAAAAATTTTTTCCCCCCGGGGGGAAAAAATTTTTT", IlluminaAdapterPair.PAIRED_END},
// This matches on second end and matches at higher stringency on that one side
{"Basic positive second end one-sided test matching", "AAAAAATTTTTTCCCCCCGGGGGGAAAAAATTTTTT", "AAAAAATTTTTTCCCAGATCGGAAGAGCGTCGTGTA", IlluminaAdapterPair.PAIRED_END},
// This matches on one side and does not match at higher stringency on that one side
{"Basic negative one-sided test matching", "GGCGCTGAAACTACTGGCGCTGAAAAGATCGGAAGA", "AAAAAATTTTTTCCCCCCGGGGGGAAAAAATTTTTT", null},
// These match but at different positions. No clip should be done.
{"Mis-match paired test matching", "CTACTGGCGCTGAAACTGAGCAGCCAAGCAGATCGG", "AGCTTGGCTGCTCAGTTTCAGCGCCAGTAGAGATCG", null},
};
}
private static class SingleEndAdapter implements AdapterPair {
private final String name;
private final String threePrimeAdapter;
private SingleEndAdapter(final String name, final String threePrimeAdapter) {
this.name = name;
this.threePrimeAdapter = threePrimeAdapter;
}
@Override
public String get3PrimeAdapter() {
return threePrimeAdapter;
}
@Override
public String get3PrimeAdapterInReadOrder() {
return threePrimeAdapter;
}
@Override
public byte[] get3PrimeAdapterBytes() {
return StringUtil.stringToBytes(threePrimeAdapter);
}
@Override
public byte[] get3PrimeAdapterBytesInReadOrder() {
return get3PrimeAdapterBytes();
}
@Override
public String get5PrimeAdapter() {
throw new UnsupportedOperationException();
}
@Override
public String get5PrimeAdapterInReadOrder() {
throw new UnsupportedOperationException();
}
@Override
public byte[] get5PrimeAdapterBytes() {
throw new UnsupportedOperationException();
}
@Override
public byte[] get5PrimeAdapterBytesInReadOrder() {
throw new UnsupportedOperationException();
}
@Override
public String getName() {
return name;
}
}
/**
* @return A String of given length with 6 As, 6 Cs, 6 Gs, etc. until length is reached.
*/
private static String makeBogusReadString(final int len) {
final StringBuilder builder = new StringBuilder(len);
final Map<Character, Character> nextChar = new HashMap<Character, Character>();
nextChar.put('A', 'C');
nextChar.put('C', 'G');
nextChar.put('G', 'T');
nextChar.put('T', 'A');
for (char curChar = 'A'; true; curChar = nextChar.get(curChar)) {
for (int i = 0; i < 6; ++i) {
if (builder.length() == len) return builder.toString();
builder.append(curChar);
}
}
}
private static String patchAdapterSubsequenceIntoRead(final String read, final String adapter, final int length) {
return replaceSubstring(read, adapter.substring(0, length), read.length() - length, read.length());
}
private static String replaceSubstring(final String s, final String replacement, final int position, final int charsToReplace) {
final StringBuilder sb = new StringBuilder(s);
sb.replace(position, position + charsToReplace, replacement);
return sb.toString();
}
@Test
public void testAdapterTruncation() {
final AdapterMarker marker = new AdapterMarker(30,
IlluminaUtil.IlluminaAdapterPair.INDEXED,
IlluminaUtil.IlluminaAdapterPair.DUAL_INDEXED,
IlluminaUtil.IlluminaAdapterPair.NEXTERA_V2,
IlluminaUtil.IlluminaAdapterPair.FLUIDIGM);
// INDEXED and DUAL_INDEXED should collapse at length 30
Assert.assertTrue(marker.getAdapters().length < 4, "Did not collapse: " + marker.getAdapters().length);
}
/**
* Confirm that after the requisite number of sightings of adapter, the list is trimmed
*/
@Test(dataProvider = "testAdapterListTruncationDataProvider")
public void testAdapterListTruncation(final IlluminaAdapterPair adapterPair) {
final int thresholdForTruncatingAdapterList = 20;
final int readLength = 100;
// Throw all the adapter pairs into the marker. There is some danger in doing this because
// IlluminaAdapterPair.ALTERNATIVE_SINGLE_END has 3' that is a substring of some of the others. In the enum it appears
// last so it is tested last.
final AdapterMarker marker =
new AdapterMarker(IlluminaUtil.IlluminaAdapterPair.values()).
setThresholdForSelectingAdaptersToKeep(thresholdForTruncatingAdapterList);
int adapterPosition = 1;
Assert.assertTrue(adapterPosition + thresholdForTruncatingAdapterList < readLength - ClippingUtility.MIN_MATCH_BASES,
"Test is configured improperly -- eventually will not be enough adapter bases in read.");
int originalNumberOfAdapters = marker.getAdapters().length;
for (int i = 0; i < thresholdForTruncatingAdapterList; ++i) {
// First, a read with no adapter in it.
SAMRecord rec = new SAMRecord(null);
rec.setReadString(makeBogusReadString(readLength));
AdapterPair matchedPair = marker.adapterTrimIlluminaSingleRead(rec);
Assert.assertNull(matchedPair);
Assert.assertNull(rec.getAttribute(ReservedTagConstants.XT));
// Adapter list should not be truncated yet
Assert.assertEquals(marker.getAdapters().length, originalNumberOfAdapters);
// Then, a record with some adapter sequence in it.
rec = createSamRecordWithAdapterSequence(readLength, adapterPair, adapterPosition);
matchedPair = marker.adapterTrimIlluminaSingleRead(rec);
Assert.assertNotNull(matchedPair);
Assert.assertEquals(rec.getIntegerAttribute(ReservedTagConstants.XT).intValue(), adapterPosition + 1,
rec.getReadString() + " matched " + matchedPair);
// Put adapter in different place next time.
adapterPosition++;
}
Assert.assertEquals(marker.getAdapters().length, 1, "Did not truncate adapter list to 1 element");
Assert.assertTrue(marker.getAdapters()[0].getName().contains(adapterPair.getName()),
String.format("Expected '%s' to contain '%s'", marker.getAdapters()[0].getName(), adapterPair.getName()));
}
@DataProvider(name="testAdapterListTruncationDataProvider")
public Object[][] testAdapterListTruncationDataProvider() {
Object[][] ret = new Object[IlluminaAdapterPair.values().length][];
for (int i = 0; i < ret.length; ++i) {
ret[i] = new Object[]{IlluminaAdapterPair.values()[i]};
}
return ret;
}
}