/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMFormatException;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.util.IOUtil;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import picard.cmdline.CommandLineProgramTest;
import picard.PicardException;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
/**
* Tests for SamToFastq
*/
public class SamToFastqTest extends CommandLineProgramTest {
private static final File TEST_DATA_DIR = new File("testdata/picard/sam/bam2fastq/paired");
private static final String CLIPPING_TEST_DATA = "ok/clipping_test.sam";
public String getCommandLineProgramName() {
return SamToFastq.class.getSimpleName();
}
@DataProvider(name = "okFiles")
public Object[][] okFiles() {
return new Object[][] {
{"ok/sorted-pair.sam"}, // 5 sorted pairs (10 records) - mate1, mate2
{"ok/sorted-pair-no-rg.sam"}, // 5 sorted pairs (10 records) - mate1, mate2, no read group
{"ok/last-pair-mates-flipped.sam" }, // 4 pairs, :05 mate2, mate1
{"ok/first-mate-bof-last-mate-eof.sam"}, // :01 mate1, 4 pairs, :01 mate2
};
}
@DataProvider(name = "badFiles")
public Object[][] badFiles() {
return new Object[][] {
{"bad/unpaired-mate.sam"} // mate1 without its mate2
};
}
private void convertFile(final String [] args) {
Assert.assertEquals(runPicardCommandLine(args), 0);
}
@Test(dataProvider = "clippingTests")
public void testClipping(final String clippingAction, final String bases1_1, final String quals1_1, final String bases1_2, final String quals1_2,
final String bases2_1, final String quals2_1, final String bases2_2, final String quals2_2, final String testName) throws IOException {
final File samFile = new File(TEST_DATA_DIR, CLIPPING_TEST_DATA) ;
final File f1 = File.createTempFile("clippingtest1", "fastq");
final File f2 = File.createTempFile("clippingtest2", "fastq");
f1.deleteOnExit();
f2.deleteOnExit();
if (clippingAction != null) {
convertFile(new String[]{
"INPUT=" + samFile.getAbsolutePath(),
"FASTQ=" + f1.getAbsolutePath(),
"SECOND_END_FASTQ=" + f2.getAbsolutePath(),
"CLIPPING_ACTION=" + clippingAction,
"CLIPPING_ATTRIBUTE=" + "XT"
});
} else {
convertFile(new String[]{
"INPUT=" + samFile.getAbsolutePath(),
"FASTQ=" + f1.getAbsolutePath(),
"SECOND_END_FASTQ=" + f2.getAbsolutePath(),
});
}
Iterator<FastqRecord> it = new FastqReader(f1).iterator();
FastqRecord first = it.next();
Assert.assertEquals(first.getReadString(), bases1_1, testName);
Assert.assertEquals(first.getBaseQualityString(), quals1_1, testName);
FastqRecord second = it.next();
Assert.assertEquals(second.getReadString(), bases1_2, testName);
Assert.assertEquals(second.getBaseQualityString(), quals1_2, testName);
it = new FastqReader(f2).iterator();
first = it.next();
Assert.assertEquals(first.getReadString(), bases2_1, testName);
Assert.assertEquals(first.getBaseQualityString(), quals2_1, testName);
second = it.next();
Assert.assertEquals(second.getReadString(), bases2_2, testName);
Assert.assertEquals(second.getBaseQualityString(), quals2_2, testName);
}
@DataProvider(name = "clippingTests")
public Object[][] clippingTests() {
return new Object[][] {
{null, "AAAAAAAAAA", "1111111111", "AAAAAAAAAA", "1111111111", "CCCCCCCCCC", "2222222222", "GGGGGGGGGG", "2222222222", "No clipping test"},
{"X", "AAAAAAA", "1111111", "AAAAAA", "111111", "CCCCCCCC", "22222222", "GGGGGG", "222222", "Cut clipped bases test"},
{"N", "AAAAAAANNN", "1111111111", "AAAAAANNNN", "1111111111", "CCCCCCCCNN", "2222222222", "GGGGGGNNNN", "2222222222", "Mask clipped bases test"},
{"2", "AAAAAAAAAA", "1111111###", "AAAAAAAAAA", "111111####", "CCCCCCCCCC", "22222222##", "GGGGGGGGGG", "222222####", "Change clipped qualities test"}
};
}
@Test(dataProvider = "okFiles")
public void testOkFile(final String samFilename) throws IOException {
final File samFile = new File(TEST_DATA_DIR,samFilename);
final File pair1File = newTempFastqFile("pair1");
final File pair2File = newTempFastqFile("pair2");
convertFile(new String[]{
"INPUT=" + samFile.getAbsolutePath(),
"FASTQ=" + pair1File.getAbsolutePath(),
"SECOND_END_FASTQ=" + pair2File.getAbsolutePath()
});
// Check that paired fastq files are same size
final Set<String> outputHeaderSet1 = createFastqReadHeaderSet(pair1File);
final Set<String> outputHeaderSet2 = createFastqReadHeaderSet(pair2File);
Assert.assertEquals(outputHeaderSet1.size(), outputHeaderSet2.size());
// Create map of mate pairs from SAM records
final Map<String,MatePair> map = createSamMatePairsMap(samFile) ;
Assert.assertEquals(map.size(), outputHeaderSet2.size());
// Ensure that each mate of each pair in SAM file is in the correct fastq pair file
for (final Map.Entry<String,MatePair> entry : map.entrySet() ) {
final MatePair mpair = entry.getValue();
Assert.assertNotNull(mpair.mate1); // ensure we have two mates
Assert.assertNotNull(mpair.mate2);
Assert.assertEquals(mpair.mate1.getReadName(),mpair.mate2.getReadName());
final String readName = mpair.mate1.getReadName() ;
Assert.assertTrue(outputHeaderSet1.contains(readName+"/1")); // ensure mate is in correct file
Assert.assertTrue(outputHeaderSet2.contains(readName+"/2"));
}
}
@Test(dataProvider = "okFiles")
public void testOkInterleavedFile(final String samFilename) throws IOException {
final File samFile = new File(TEST_DATA_DIR,samFilename);
final File pairFile = newTempFastqFile("pair");
convertFile(new String[]{
"INPUT=" + samFile.getAbsolutePath(),
"FASTQ=" + pairFile.getAbsolutePath(),
"INTERLEAVE=true"
});
final Set<String> outputHeaderSet = createFastqReadHeaderSet(pairFile);
// Create map of mate pairs from SAM records
final Map<String,MatePair> map = createSamMatePairsMap(samFile) ;
Assert.assertEquals(map.size() * 2, outputHeaderSet.size());
// Ensure that each mate of each pair in SAM file is in the correct fastq pair file
for (final Map.Entry<String,MatePair> entry : map.entrySet() ) {
final MatePair mpair = entry.getValue();
Assert.assertNotNull(mpair.mate1); // ensure we have two mates
Assert.assertNotNull(mpair.mate2);
Assert.assertEquals(mpair.mate1.getReadName(),mpair.mate2.getReadName());
final String readName = mpair.mate1.getReadName() ;
Assert.assertTrue(outputHeaderSet.contains(readName+"/1")); // ensure mate is in correct file
Assert.assertTrue(outputHeaderSet.contains(readName+"/2"));
}
}
@Test (dataProvider = "badFiles", expectedExceptions= SAMFormatException.class)
public void testBadFile(final String samFilename) throws IOException {
final File samFile = new File(TEST_DATA_DIR,samFilename);
final File pair1 = File.createTempFile("tt-pair1.", ".fastq");
final File pair2 = File.createTempFile("tt-pair2.", ".fastq");
pair1.deleteOnExit();
pair2.deleteOnExit();
convertFile(new String[]{
"INPUT=" + samFile.getAbsolutePath(),
"FASTQ=" + pair1.getAbsolutePath(),
"SECOND_END_FASTQ=" + pair2.getAbsolutePath()
});
}
@DataProvider(name = "okGroupedFiles")
public Object[][] okGroupedFiles() {
return new Object[][] {
{"ok/grouped-last-pair-mates-flipped.sam", null, null, new String[]{"rg1","rg2"}},
};
}
@DataProvider(name = "badGroupedFiles")
public Object[][] badGroupedFiles() {
return new Object[][] {
{"bad/grouped-unpaired-mate.sam", null, null, new String[]{"rg1.fastq","rg2.fastq"}}
};
}
@Test(dataProvider = "okGroupedFiles")
public void testOkGroupedFiles(final String samFilename, final String fastq, final String secondEndFastq,
final String [] groupFiles) throws IOException {
final File samFile = new File(TEST_DATA_DIR,samFilename);
final Map<String, Set<String>> outputSets = new HashMap<String, Set<String>>(groupFiles.length);
final String tmpDir = IOUtil.getDefaultTmpDir().getAbsolutePath() + "/";
final String [] args = new String[]{
"INPUT=" + samFile.getAbsolutePath(),
"OUTPUT_PER_RG=true",
"OUTPUT_DIR=" + tmpDir,
};
runPicardCommandLine(args);
File f1;
File f2;
String fname1;
String fname2;
String keyName1;
String keyName2;
Set<String> outputHeaderSet1;
Set<String> outputHeaderSet2;
for(final String groupPUName : groupFiles)
{
keyName1 = groupPUName + "_1";
keyName2 = groupPUName + "_2";
fname1 = tmpDir + "/" + keyName1 + ".fastq";
fname2 = tmpDir + "/" + keyName2 + ".fastq";
f1 = new File(fname1);
f2 = new File(fname2);
f1.deleteOnExit();
f2.deleteOnExit();
IOUtil.assertFileIsReadable(f1);
IOUtil.assertFileIsReadable(f2);
// Check that paired fastq files are same size and store them for later comparison
outputHeaderSet1 = createFastqReadHeaderSet(f1);
outputHeaderSet2 = createFastqReadHeaderSet(f2);
outputSets.put(keyName1 , outputHeaderSet1);
outputSets.put(keyName2, outputHeaderSet2);
Assert.assertEquals(outputHeaderSet1.size(), outputHeaderSet2.size());
}
// Create map of read groups and mate pairs from SAM records
final Map<String, Map<String,MatePair>> map = createPUPairsMap(samFile);
for(final Map.Entry<String, Map<String, MatePair>> groupEntry : map.entrySet()) {
// Ensure that for each group, each mate of each pair in the SAM file is in the correct fastq pair file
for (final Map.Entry<String,MatePair> entry : groupEntry.getValue().entrySet() ) {
final MatePair mpair = entry.getValue();
outputHeaderSet1 = outputSets.get(groupEntry.getKey() + "_1");
outputHeaderSet2 = outputSets.get(groupEntry.getKey() + "_2");
Assert.assertNotNull(mpair.mate1); // ensure we have two mates
Assert.assertNotNull(mpair.mate2);
Assert.assertEquals(mpair.mate1.getReadName(),mpair.mate2.getReadName());
final String readName = mpair.mate1.getReadName() ;
Assert.assertTrue(outputHeaderSet1.contains(readName+"/1")); // ensure mate is in correct file
Assert.assertTrue(outputHeaderSet2.contains(readName+"/2"));
}
}
}
@Test (dataProvider = "badGroupedFiles", expectedExceptions= SAMException.class)
public void testBadGroupedFile(final String samFilename, final String fastq, final String secondEndFastq,
final String [] groupFiles) throws IOException {
final File samFile = new File(TEST_DATA_DIR,samFilename);
final String tmpDir = IOUtil.getDefaultTmpDir().getAbsolutePath() + "/";
final String [] args = new String[]{
"INPUT=" + samFile.getAbsolutePath(),
"OUTPUT_PER_RG=true",
"OUTPUT_DIR=" + tmpDir,
};
runPicardCommandLine(args);
File f1;
File f2;
String fname1;
String fname2;
for(final String groupPUName : groupFiles)
{
fname1 = tmpDir + groupPUName + "_1.fastq";
fname2 = tmpDir + groupPUName + "_2.fastq";
f1 = new File(fname1);
f2 = new File(fname2);
f1.deleteOnExit();
f1.deleteOnExit();
}
}
@Test(dataProvider = "trimmedData")
public void testTrimming(final String samFilename, final int read1Trim,
final int read1MaxBases, final int expectedRead1Length, final int read2Trim,
final int read2MaxBases, final int expectedRead2Length) throws IOException {
final File samFile = new File(TEST_DATA_DIR, samFilename);
final File pair1File = newTempFastqFile("pair1");
final File pair2File = newTempFastqFile("pair2");
pair1File.deleteOnExit();
pair2File.deleteOnExit();
convertFile(new String[]{
"INPUT=" + samFile.getAbsolutePath(),
"FASTQ=" + pair1File.getAbsolutePath(),
"SECOND_END_FASTQ=" + pair2File.getAbsolutePath(),
"READ1_TRIM=" + read1Trim,
"READ1_MAX_BASES_TO_WRITE=" + read1MaxBases,
"READ2_TRIM=" + read2Trim,
"READ2_MAX_BASES_TO_WRITE=" + read2MaxBases
});
for (final FastqRecord first : new FastqReader(pair1File)) {
Assert.assertEquals(first.getReadString().length(), expectedRead1Length, "Incorrect read length");
Assert.assertEquals(first.getBaseQualityString().length(), expectedRead1Length, "Incorrect quality string length");
}
for (final FastqRecord second : new FastqReader(pair2File)) {
Assert.assertEquals(second.getReadString().length(), expectedRead2Length, "Incorrect read length");
Assert.assertEquals(second.getBaseQualityString().length(), expectedRead2Length, "Incorrect quality string length");
}
}
@DataProvider(name = "trimmedData")
public Object[][] trimmedData() {
return new Object[][] {
// There are 13 bases in each of these reads
{"ok/sorted-pair.sam", 6, 7, 7, 5, 8, 8}, // exact matches for everything
{"ok/sorted-pair.sam", 7, 7, 6, 3, 8, 8} // fewer or more bases
};
}
private Set<String> createFastqReadHeaderSet(final File file) {
final Set<String> set = new HashSet<String>();
final FastqReader freader = new FastqReader(file);
while (freader.hasNext()) {
final FastqRecord frec = freader.next();
set.add(frec.getReadHeader());
}
return set ;
}
private Map<String,MatePair> createSamMatePairsMap(final File samFile) throws IOException {
IOUtil.assertFileIsReadable(samFile);
final SAMFileReader reader = new SAMFileReader(IOUtil.openFileForReading(samFile));
final Map<String,MatePair> map = new LinkedHashMap<String,MatePair>();
for (final SAMRecord record : reader ) {
MatePair mpair = map.get(record.getReadName());
if (mpair == null) {
mpair = new MatePair();
map.put(record.getReadName(), mpair);
}
mpair.add(record);
}
reader.close();
return map;
}
private Map<String, Map<String, MatePair>> createPUPairsMap(final File samFile) throws IOException {
IOUtil.assertFileIsReadable(samFile);
final SAMFileReader reader = new SAMFileReader(IOUtil.openFileForReading(samFile));
final Map<String, Map<String, MatePair>> map = new LinkedHashMap<String, Map<String,MatePair>>();
Map<String,MatePair> curFileMap;
for (final SAMRecord record : reader ) {
final String platformUnit = record.getReadGroup().getPlatformUnit();
curFileMap = map.get(platformUnit);
if(curFileMap == null)
{
curFileMap = new LinkedHashMap<String, MatePair>();
map.put(platformUnit, curFileMap);
}
MatePair mpair = curFileMap.get(record.getReadName());
if (mpair == null) {
mpair = new MatePair();
curFileMap.put(record.getReadName(), mpair);
}
mpair.add(record);
}
reader.close();
return map;
}
class MatePair {
SAMRecord mate1 ;
SAMRecord mate2 ;
void add(final SAMRecord record) {
if (!record.getReadPairedFlag()) throw new PicardException("Record "+record.getReadName()+" is not paired");
if (record.getFirstOfPairFlag()) {
if (mate1 != null) throw new PicardException("Mate 1 already set for record: "+record.getReadName());
mate1 = record ;
}
else if (record.getSecondOfPairFlag()) {
if (mate2 != null) throw new PicardException("Mate 2 already set for record: "+record.getReadName());
mate2 = record ;
}
else throw new PicardException("Neither FirstOfPairFlag or SecondOfPairFlag is set for a paired record");
}
}
private File newTempFastqFile(final String filename) throws IOException {
if(filename == null) return null;
final File file = File.createTempFile(filename,".fastq");
file.deleteOnExit();
return file;
}
}