package picard.reference;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.RuntimeIOException;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Fasta;
import picard.cmdline.StandardOptionDefinitions;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
/**
 * Little program to "normalize" a fasta file to ensure that all lines of sequence are the
 * same length, and are a reasonable length!
 *
 * <p>Each sequence read from {@link #INPUT} is written to {@link #OUTPUT} as a
 * {@code >name} header line followed by its bases, with a line break inserted after every
 * {@link #LINE_LENGTH} bases. A sequence with no bases produces only its header line and a
 * logged warning.
 */
@CommandLineProgramProperties(
        usage = "Takes any file that conforms to the fasta format and " +
                "normalizes it so that all lines of sequence except the last line per named sequence " +
                "are of the same length.",
        usageShort = "Normalizes lines of sequence in a fasta file to be of the same length",
        programGroup = Fasta.class
)
public class NormalizeFasta extends CommandLineProgram {

    @Option(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The input fasta file to normalize.")
    public File INPUT;

    @Option(shortName= StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="The output fasta file to write.")
    public File OUTPUT;

    @Option(doc="The line length to be used for the output fasta file.")
    public int LINE_LENGTH=100;

    @Option(doc="Truncate sequence names at first whitespace.")
    public boolean TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE=false;

    private final Log log = Log.getInstance(NormalizeFasta.class);

    public static void main(final String[] args) {
        new NormalizeFasta().instanceMainWithExit(args);
    }

    /**
     * Reads every sequence from {@link #INPUT} and rewrites it to {@link #OUTPUT} with
     * sequence lines wrapped at {@link #LINE_LENGTH} bases.
     *
     * @return 0 when all sequences have been written and the output has been closed
     * @throws IllegalArgumentException if input and output are the same file, or if
     *                                  {@link #LINE_LENGTH} is less than 1
     * @throws PicardException          if writing a sequence to the output fails
     * @throws RuntimeIOException       if closing the output fails after all sequences were written
     */
    @Override
    protected int doWork() {
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsWritable(OUTPUT);

        if (INPUT.getAbsoluteFile().equals(OUTPUT.getAbsoluteFile())) {
            throw new IllegalArgumentException("Input and output cannot be the same file.");
        }

        // A zero length would otherwise fail with an ArithmeticException from the modulo in
        // writeSequence (after part of the output was already written), and a negative length
        // would silently wrap as if it were positive. Reject both before touching any file.
        if (LINE_LENGTH < 1) {
            throw new IllegalArgumentException("LINE_LENGTH must be at least 1 but was " + LINE_LENGTH + ".");
        }

        final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(INPUT, TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE);
        try {
            final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT);
            try {
                ReferenceSequence seq;
                while ((seq = ref.nextSequence()) != null) {
                    try {
                        writeSequence(out, seq.getName(), seq.getBases());
                    }
                    catch (IOException ioe) {
                        throw new PicardException("Error writing to file " + OUTPUT.getAbsolutePath(), ioe);
                    }
                }

                // Close explicitly on the success path so that a failure to flush the final
                // buffer is reported to the caller rather than silently dropped.
                try {
                    out.close();
                } catch (IOException e) {
                    throw new RuntimeIOException(e);
                }
            } finally {
                // Safety net for the failure paths: releases the writer without replacing an
                // exception that is already propagating. Harmless after a successful close.
                CloserUtil.close(out);
            }
        } finally {
            // The reference reader is released on every path, including failures.
            CloserUtil.close(ref);
        }

        return 0;
    }

    /**
     * Writes one fasta record: a {@code >name} header line followed by the bases, with a
     * line break inserted before every base whose index is a positive multiple of
     * {@link #LINE_LENGTH}. Logs a warning and writes only the header when there are no bases.
     *
     * @param out   the writer to append the record to
     * @param name  the sequence name written after the {@code >} marker
     * @param bases the bases of the sequence, written one character per byte
     * @throws IOException if the underlying writer fails
     */
    private void writeSequence(final BufferedWriter out, final String name, final byte[] bases) throws IOException {
        out.write(">");
        out.write(name);
        // NOTE(review): the header line is terminated with the platform line separator while
        // sequence lines below use a literal "\n"; kept as-is so output bytes are unchanged.
        out.newLine();

        if (bases.length == 0) {
            log.warn("Sequence " + name + " contains 0 bases.");
            return;
        }

        for (int i = 0; i < bases.length; ++i) {
            if (i > 0 && i % LINE_LENGTH == 0) {
                out.write("\n");
            }
            out.write(bases[i]);
        }
        out.write("\n");
    }
}