/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.engine.alignment.bwa;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.StringUtil;
import org.broadinstitute.gatk.engine.alignment.reference.bwt.*;
import org.broadinstitute.gatk.engine.alignment.reference.packing.PackUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import java.io.File;
import java.io.IOException;
/**
* Support files for BWT.
*
* @author mhanna
* @version 0.1
*/
public class BWTFiles {
    /**
     * ANN (alternate dictionary) file.
     */
    public final File annFile;

    /**
     * AMB (holes) file.
     */
    public final File ambFile;

    /**
     * Packed reference sequence file.
     */
    public final File pacFile;

    /**
     * Reverse of packed reference sequence file.
     */
    public final File rpacFile;

    /**
     * Forward BWT file.
     */
    public final File forwardBWTFile;

    /**
     * Forward suffix array file.
     */
    public final File forwardSAFile;

    /**
     * Reverse BWT file.
     */
    public final File reverseBWTFile;

    /**
     * Reverse suffix array file.
     */
    public final File reverseSAFile;

    /**
     * Were these files autogenerated on the fly?  When true, {@link #close()}
     * deletes them; when false, {@link #close()} leaves them untouched.
     */
    public final boolean autogenerated;

    /**
     * Create a new set of BWT file references by appending the standard
     * extensions (.ann, .amb, .pac, .rpac, .bwt, .sa, .rbwt, .rsa) to the given prefix.
     * No file is created or checked for existence; the resulting object is
     * marked as not autogenerated.
     * @param prefix Prefix to use when building each file name.  Must not be null.
     * @throws ReviewedGATKException if prefix is null.
     */
    public BWTFiles(String prefix) {
        if(prefix == null)
            throw new ReviewedGATKException("Prefix must not be null.");
        annFile = new File(prefix + ".ann");
        ambFile = new File(prefix + ".amb");
        pacFile = new File(prefix + ".pac");
        rpacFile = new File(prefix + ".rpac");
        forwardBWTFile = new File(prefix + ".bwt");
        forwardSAFile = new File(prefix + ".sa");
        reverseBWTFile = new File(prefix + ".rbwt");
        reverseSAFile = new File(prefix + ".rsa");
        autogenerated = false;
    }

    /**
     * Hand-create a new BWTFiles object, specifying a unique file object for each type.
     * Objects built through this constructor are marked as autogenerated.
     * @param annFile ANN (alternate dictionary) file.
     * @param ambFile AMB (holes) files.
     * @param pacFile Packed representation of the forward reference sequence.
     * @param forwardBWTFile BWT representation of the forward reference sequence.
     * @param forwardSAFile SA representation of the forward reference sequence.
     * @param rpacFile Packed representation of the reversed reference sequence.
     * @param reverseBWTFile BWT representation of the reversed reference sequence.
     * @param reverseSAFile SA representation of the reversed reference sequence.
     */
    private BWTFiles(File annFile,
                     File ambFile,
                     File pacFile,
                     File forwardBWTFile,
                     File forwardSAFile,
                     File rpacFile,
                     File reverseBWTFile,
                     File reverseSAFile) {
        this.annFile = annFile;
        this.ambFile = ambFile;
        this.pacFile = pacFile;
        this.forwardBWTFile = forwardBWTFile;
        this.forwardSAFile = forwardSAFile;
        this.rpacFile = rpacFile;
        this.reverseBWTFile = reverseBWTFile;
        this.reverseSAFile = reverseSAFile;
        autogenerated = true;
    }

    /**
     * Close out this files object, in the process deleting any temporary files
     * that were created.  Does nothing for files that were not autogenerated.
     * Every file is attempted even if an earlier deletion fails, and a file that
     * is already gone counts as cleaned up, so calling this method more than once
     * is harmless.
     * @throws ReviewedGATKException if at least one autogenerated file still exists after the attempt.
     */
    public void close() {
        if(!autogenerated)
            return;

        // Non-short-circuit '&' so that every file is attempted regardless of earlier failures.
        boolean success = deleteIfPresent(annFile);
        success &= deleteIfPresent(ambFile);
        success &= deleteIfPresent(pacFile);
        success &= deleteIfPresent(forwardBWTFile);
        success &= deleteIfPresent(forwardSAFile);
        success &= deleteIfPresent(rpacFile);
        success &= deleteIfPresent(reverseBWTFile);
        success &= deleteIfPresent(reverseSAFile);
        if(!success)
            throw new ReviewedGATKException("Unable to clean up autogenerated representation");
    }

    /**
     * Create a new set of BWT files from the given reference sequence.  The caller's
     * array is not modified; a copy is normalized and encoded into temporary files.
     * If any step fails, the temporary files created so far are deleted before the
     * exception propagates.
     * @param referenceSequence Sequence from which to build metadata.  Must not be null.
     * @return A new object representing encoded representations of each sequence.
     * @throws ReviewedGATKException if the sequence is null, contains a base other than
     *         A, C, G or T (case-insensitive), or the temporary files cannot be written.
     */
    public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) {
        if(referenceSequence == null)
            throw new ReviewedGATKException("Reference sequence must not be null.");

        // Work on a private copy so that normalization never touches the caller's array.
        byte[] normalizedReferenceSequence = referenceSequence.clone();
        normalizeReferenceSequence(normalizedReferenceSequence);

        File annFile = null, ambFile = null, pacFile = null, bwtFile = null;
        File saFile = null, rpacFile = null, rbwtFile = null, rsaFile = null;
        boolean succeeded = false;

        try {
            // Write the ann and amb for this reference sequence.
            annFile = createTempFile(".ann");
            ambFile = createTempFile(".amb");

            SAMSequenceDictionary dictionary = new SAMSequenceDictionary();
            dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length));

            ANNWriter annWriter = new ANNWriter(annFile);
            try {
                annWriter.write(dictionary);
            }
            finally {
                annWriter.close();
            }

            AMBWriter ambWriter = new AMBWriter(ambFile);
            try {
                ambWriter.writeEmpty(dictionary);
            }
            finally {
                ambWriter.close();
            }

            // Write the encoded files for the forward version of this reference sequence.
            pacFile = createTempFile(".pac");
            bwtFile = createTempFile(".bwt");
            saFile = createTempFile(".sa");
            writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile);

            // Write the encoded files for the reverse version of this reference sequence.
            byte[] reverseReferenceSequence = Utils.reverse(normalizedReferenceSequence);
            rpacFile = createTempFile(".rpac");
            rbwtFile = createTempFile(".rbwt");
            rsaFile = createTempFile(".rsa");
            writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile);

            succeeded = true;
        }
        catch(IOException ex) {
            // Keep the underlying I/O failure attached so the root cause is not lost.
            throw new ReviewedGATKException("Unable to write autogenerated reference sequence to temporary files",ex);
        }
        finally {
            // On any failure (checked or unchecked), do not leave partial temporary files behind.
            if(!succeeded)
                deleteQuietly(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile);
        }

        return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile);
    }

    /**
     * Create a temporary file with the "bwt" prefix and the given suffix, and register it
     * for deletion on JVM exit immediately, so that it is cleaned up at the very least
     * when the JVM terminates normally.
     * @param suffix File name suffix, including the leading dot.
     * @return The newly created temporary file.
     * @throws IOException if the temporary file cannot be created.
     */
    private static File createTempFile(String suffix) throws IOException {
        File tempFile = File.createTempFile("bwt",suffix);
        tempFile.deleteOnExit();
        return tempFile;
    }

    /**
     * Delete the given file, treating a file that no longer exists as successfully deleted.
     * @param file File to delete.
     * @return true if the file is gone after this call; false if it still exists.
     */
    private static boolean deleteIfPresent(File file) {
        return file.delete() || !file.exists();
    }

    /**
     * Best-effort deletion of a set of files; null entries are skipped and failures are
     * ignored because this is only used while another failure is already being reported.
     * @param files Files to delete; individual entries may be null.
     */
    private static void deleteQuietly(File... files) {
        for(File file: files) {
            if(file != null)
                file.delete();
        }
    }

    /**
     * Write the encoded form of the reference sequence.  In the case of BWA, the encoded reference
     * sequence is the reference itself in PAC format, the BWT, and the suffix array.
     * Each writer is closed even if writing fails.
     * @param referenceSequence The reference sequence to encode.
     * @param pacFile Target for the PAC-encoded reference.
     * @param bwtFile Target for the BWT representation of the reference.
     * @param suffixArrayFile Target for the suffix array encoding of the reference.
     * @throws java.io.IOException In case of issues writing to the file.
     */
    private static void writeEncodedReferenceSequence(byte[] referenceSequence,
                                                      File pacFile,
                                                      File bwtFile,
                                                      File suffixArrayFile) throws IOException {
        PackUtils.writeReferenceSequence(pacFile,referenceSequence);

        BWT bwt = BWT.createFromReferenceSequence(referenceSequence);
        BWTWriter bwtWriter = new BWTWriter(bwtFile);
        try {
            bwtWriter.write(bwt);
        }
        finally {
            bwtWriter.close();
        }

        SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
        SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile);
        try {
            suffixArrayWriter.write(suffixArray);
        }
        finally {
            suffixArrayWriter.close();
        }
    }

    /**
     * Convert the given reference sequence into a form suitable for building into
     * on-the-fly sequences: the array is upper-cased via StringUtil.toUpperCase and
     * then every base is checked to be one of A, C, G or T.
     * @param referenceSequence The reference sequence to normalize.
     * @throws org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException if normalized sequence cannot be generated.
     */
    private static void normalizeReferenceSequence(byte[] referenceSequence) {
        StringUtil.toUpperCase(referenceSequence);
        for(byte base: referenceSequence) {
            if(base != 'A' && base != 'C' && base != 'G' && base != 'T')
                throw new ReviewedGATKException(String.format("Base type %c is not supported when building references on-the-fly",(char)base));
        }
    }
}