/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Fasta;
import picard.cmdline.StandardOptionDefinitions;
import java.io.File;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no
* SAMRecords, and the header contains only sequence records.
*/
@CommandLineProgramProperties(
usage = "Read fasta or fasta.gz containing reference sequences, and write as a SAM or BAM file with only sequence dictionary.\n",
usageShort = "Creates a SAM or BAM file from reference sequence in fasta format",
programGroup = Fasta.class
)
public class CreateSequenceDictionary extends CommandLineProgram {
// The following attributes define the command-line arguments
@Option(doc = "Input reference fasta or fasta.gz", shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME)
public File REFERENCE;
@Option(doc = "Output SAM or BAM file containing only the sequence dictionary",
shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME)
public File OUTPUT;
@Option(doc = "Put into AS field of sequence dictionary entry if supplied", optional = true)
public String GENOME_ASSEMBLY;
@Option(doc = "Put into UR field of sequence dictionary entry. If not supplied, input reference file is used",
optional = true)
public String URI;
@Option(doc = "Put into SP field of sequence dictionary entry", optional = true)
public String SPECIES;
@Option(doc = "Make sequence name the first word from the > line in the fasta file. " +
"By default the entire contents of the > line is used, excluding leading and trailing whitespace.")
public boolean TRUNCATE_NAMES_AT_WHITESPACE = true;
@Option(doc = "Stop after writing this many sequences. For testing.")
public int NUM_SEQUENCES = Integer.MAX_VALUE;
private final MessageDigest md5;
public CreateSequenceDictionary() {
try {
md5 = MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e) {
throw new PicardException("MD5 algorithm not found", e);
}
}
public static void main(final String[] argv) {
System.exit(new CreateSequenceDictionary().instanceMain(argv));
}
/**
* Use reference filename to create URI to go into header if URI was not passed on cmd line.
*/
protected String[] customCommandLineValidation() {
if (URI == null) {
URI = "file:" + REFERENCE.getAbsolutePath();
}
return null;
}
/**
* Do the work after command line has been parsed.
* RuntimeException may be thrown by this method, and are reported appropriately.
*
* @return program exit status.
*/
protected int doWork() {
if (OUTPUT.exists()) {
throw new PicardException(OUTPUT.getAbsolutePath() +
" already exists. Delete this file and try again, or specify a different output file.");
}
final SAMSequenceDictionary sequences = makeSequenceDictionary(REFERENCE);
final SAMFileHeader samHeader = new SAMFileHeader();
samHeader.setSequenceDictionary(sequences);
final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(samHeader, false, OUTPUT);
samWriter.close();
return 0;
}
/**
* Read all the sequences from the given reference file, and convert into SAMSequenceRecords
* @param referenceFile fasta or fasta.gz
* @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments.
*/
SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) {
final ReferenceSequenceFile refSeqFile =
ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, TRUNCATE_NAMES_AT_WHITESPACE);
ReferenceSequence refSeq;
final List<SAMSequenceRecord> ret = new ArrayList<SAMSequenceRecord>();
final Set<String> sequenceNames = new HashSet<String>();
for (int numSequences = 0; numSequences < NUM_SEQUENCES && (refSeq = refSeqFile.nextSequence()) != null; ++numSequences) {
if (sequenceNames.contains(refSeq.getName())) {
throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName());
}
sequenceNames.add(refSeq.getName());
ret.add(makeSequenceRecord(refSeq));
}
return new SAMSequenceDictionary(ret);
}
/**
* Create one SAMSequenceRecord from a single fasta sequence
*/
private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) {
final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(), refSeq.length());
// Compute MD5 of upcased bases
final byte[] bases = refSeq.getBases();
for (int i = 0; i < bases.length; ++i) {
bases[i] = StringUtil.toUpperCase(bases[i]);
}
ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases));
if (GENOME_ASSEMBLY != null) {
ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY);
}
ret.setAttribute(SAMSequenceRecord.URI_TAG, URI);
if (SPECIES != null) {
ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES);
}
return ret;
}
private String md5Hash(final byte[] bytes) {
md5.reset();
md5.update(bytes);
String s = new BigInteger(1, md5.digest()).toString(16);
if (s.length() != 32) {
final String zeros = "00000000000000000000000000000000";
s = zeros.substring(0, 32 - s.length()) + s;
}
return s;
}
}