/*
* Created on Feb 20, 2006
*
*/
/**
* This is a simple command line class to do classification.
*/
package edu.msu.cme.rdp.classifier;
import edu.msu.cme.rdp.classifier.cli.CmdOptions;
import edu.msu.cme.rdp.classifier.io.ClassificationResultFormatter;
import edu.msu.cme.rdp.classifier.utils.ClassifierFactory;
import edu.msu.cme.rdp.classifier.utils.ClassifierSequence;
import edu.msu.cme.rdp.readseq.readers.SequenceReader;
import edu.msu.cme.rdp.readseq.readers.SeqReader;
import edu.msu.cme.rdp.readseq.readers.Sequence;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
/**
* This is the legacy command line class to do the classification. See edu.msu.cme.rdp.multicompare.main
* @author wangqion
*/
public class ClassifierCmd {

    /** Command line options accepted by {@link #main(String[])}. */
    private static final Options options = new Options();

    static {
        // Registered as a flag without an argument; keep this for compatibility with old interface
        options.addOption(new Option(CmdOptions.QUERYFILE_SHORT_OPT, CmdOptions.QUERYFILE_LONG_OPT, false, CmdOptions.QUERYFILE_DESC));
        options.addOption(new Option(CmdOptions.OUTFILE_SHORT_OPT, CmdOptions.OUTFILE_LONG_OPT, true, CmdOptions.OUTFILE_DESC));
        options.addOption(new Option(CmdOptions.TRAINPROPFILE_SHORT_OPT, CmdOptions.TRAINPROPFILE_LONG_OPT, true, CmdOptions.TRAINPROPFILE_DESC));
        options.addOption(new Option(CmdOptions.FORMAT_SHORT_OPT, CmdOptions.FORMAT_LONG_OPT, true, CmdOptions.FORMAT_DESC));
        options.addOption(new Option(CmdOptions.GENE_SHORT_OPT, CmdOptions.GENE_LONG_OPT, true, CmdOptions.GENE_DESC));
        options.addOption(new Option(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT, CmdOptions.MIN_BOOTSTRAP_WORDS_LONG_OPT, true, CmdOptions.MIN_WORDS_DESC));
    }

    /**
     * Classifies the query sequences read from the input file and writes the
     * formatted classification results to the output file.
     * <p>
     * A sequence that triggers a {@code ShortSequenceException} is reported on
     * standard output and skipped; any other failure while classifying or writing
     * a single sequence has its stack trace printed and processing continues with
     * the next sequence.
     *
     * @param inputFile path of the file holding the query sequences
     * @param outFile path of the file the classification results are written to
     * @param propfile property file mapping the training files; if not null it
     *        overrides the default property file
     * @param format output format; if null, {@code CmdOptions.DEFAULT_FORMAT} is used
     * @param gene gene name used to look up the classifier factory
     * @param min_bootstrap_words minimum number of bootstrap words passed to the classifier
     * @throws IOException if a file cannot be opened, read or closed
     * @throws TrainingDataException declared for problems with the training data
     */
    public void doClassify(String inputFile, String outFile, String propfile, ClassificationResultFormatter.FORMAT format, String gene, int min_bootstrap_words) throws IOException, TrainingDataException {
        if (propfile != null) {
            ClassifierFactory.setDataProp(propfile, false);
        }
        if (format == null) {
            format = CmdOptions.DEFAULT_FORMAT;
        }

        ClassifierFactory factory = ClassifierFactory.getFactory(gene);
        Classifier aClassifier = factory.createClassifier();

        SeqReader parser = new SequenceReader(new File(inputFile));
        try {
            BufferedWriter wt = new BufferedWriter(new FileWriter(outFile));
            try {
                Sequence pSeq;
                while ((pSeq = parser.readNextSequence()) != null) {
                    try {
                        ClassificationResult result = aClassifier.classify(new ClassifierSequence(pSeq), min_bootstrap_words);
                        wt.write(ClassificationResultFormatter.getOutput(result, format));
                    } catch (ShortSequenceException e) {
                        System.out.println(e.getMessage());
                    } catch (Exception e) {
                        // Best effort: report the failure and move on to the next sequence.
                        e.printStackTrace();
                    }
                }
            } finally {
                wt.close();
            }
        } finally {
            // Release the input file even when the writer could not be opened or closed.
            parser.close();
        }
    }

    /**
     * Prints the license information to std err.
     */
    public static void printLicense() {
        String license = "Copyright 2006-2011 Michigan State University Board of Trustees.\n\n"
                + "This program is free software; you can redistribute it and/or modify it under the "
                + "terms of the GNU General Public License as published by the Free Software Foundation; "
                + "either version 2 of the License, or (at your option) any later version.\n\n"
                + "This program is distributed in the hope that it will be useful, "
                + "but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY "
                + "or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.\n\n"
                + "You should have received a copy of the GNU General Public License along with this program; "
                + "if not, write to the Free Software Foundation, Inc., 59 Temple Place, "
                + "Suite 330, Boston, MA 02111-1307 USA\n\n"
                + "Authors's mailng address:\n"
                + "Center for Microbial Ecology\n"
                + "2225A Biomedical Physical Science\n"
                + "Michigan State University\n"
                + "East Lansing, Michigan USA 48824-4320\n"
                + "E-mail: James R. Cole at colej@msu.edu\n"
                + "\tQiong Wang at wangqion@msu.edu\n"
                + "\tJames M. Tiedje at tiedjej@msu.edu\n\n";
        System.err.println(license);
    }

    /**
     * Maps a user supplied format name (case-insensitive) to the output format.
     *
     * @param name one of allrank, fixrank, filterbyconf or db
     * @return the matching output format
     * @throws IllegalArgumentException if the name is not one of the allowed values
     */
    private static ClassificationResultFormatter.FORMAT parseFormat(String name) {
        if (name.equalsIgnoreCase("allrank")) {
            return ClassificationResultFormatter.FORMAT.allRank;
        }
        if (name.equalsIgnoreCase("fixrank")) {
            return ClassificationResultFormatter.FORMAT.fixRank;
        }
        if (name.equalsIgnoreCase("filterbyconf")) {
            return ClassificationResultFormatter.FORMAT.filterbyconf;
        }
        if (name.equalsIgnoreCase("db")) {
            return ClassificationResultFormatter.FORMAT.dbformat;
        }
        throw new IllegalArgumentException("Not valid output format, only allrank, fixrank, filterbyconf and db allowed");
    }

    /**
     * This is the main method to do classification.
     * <p>Usage: ClassifierCmd [options] &lt;samplefile&gt;
     * <br>
     * Exactly one query (sample) file must be given as the positional argument.
     * <br>
     * The output file option is required and names the file used to save the
     * classification output.
     * <br>
     * The training property file option and the gene option are mutually exclusive.
     * When neither is given, the gene defaults to {@code CmdOptions.DEFAULT_GENE}.
     * <br>
     * The output format option accepts allrank, fixrank, filterbyconf and db.
     * <br>
     * The minimum bootstrap words option must be at least
     * {@code Classifier.MIN_BOOTSTRSP_WORDS}.
     * <p>
     * If the command line is invalid, the error and the usage help are printed
     * and the method returns without classifying.
     *
     * @param args the command line arguments
     * @throws Exception if the classification itself fails
     */
    public static void main(String[] args) throws Exception {
        String queryFile = null;
        String outputFile = null;
        String propFile = null;
        String gene = null;
        ClassificationResultFormatter.FORMAT format = CmdOptions.DEFAULT_FORMAT;
        int min_bootstrap_words = Classifier.MIN_BOOTSTRSP_WORDS;

        try {
            CommandLine line = new PosixParser().parse(options, args);

            if (!line.hasOption(CmdOptions.OUTFILE_SHORT_OPT)) {
                throw new IllegalArgumentException("outputFile must be specified");
            }
            outputFile = line.getOptionValue(CmdOptions.OUTFILE_SHORT_OPT);

            if (line.hasOption(CmdOptions.TRAINPROPFILE_SHORT_OPT)) {
                propFile = line.getOptionValue(CmdOptions.TRAINPROPFILE_SHORT_OPT);
            }

            if (line.hasOption(CmdOptions.FORMAT_SHORT_OPT)) {
                format = parseFormat(line.getOptionValue(CmdOptions.FORMAT_SHORT_OPT));
            }

            if (line.hasOption(CmdOptions.GENE_SHORT_OPT)) {
                // The property file option is read first, so this single check
                // enforces that the two options are not combined.
                if (propFile != null) {
                    throw new IllegalArgumentException("Already specified train_propfile. Can not specify gene any more");
                }
                gene = line.getOptionValue(CmdOptions.GENE_SHORT_OPT).toLowerCase();
                if (!gene.equals(ClassifierFactory.RRNA_16S_GENE) && !gene.equals(ClassifierFactory.FUNGALLSU_GENE)) {
                    throw new IllegalArgumentException(gene + " is NOT valid, only allows " + ClassifierFactory.RRNA_16S_GENE + " and " + ClassifierFactory.FUNGALLSU_GENE);
                }
            }

            if (line.hasOption(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT)) {
                min_bootstrap_words = Integer.parseInt(line.getOptionValue(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT));
                if (min_bootstrap_words < Classifier.MIN_BOOTSTRSP_WORDS) {
                    throw new IllegalArgumentException(CmdOptions.MIN_BOOTSTRAP_WORDS_LONG_OPT + " must be at least " + Classifier.MIN_BOOTSTRSP_WORDS);
                }
            }

            String[] remaining = line.getArgs();
            if (remaining.length != 1) {
                throw new IllegalArgumentException("Expect one query file");
            }
            queryFile = remaining[0];
        } catch (Exception e) {
            System.out.println("Command Error: " + e.getMessage());
            new HelpFormatter().printHelp(120, "ClassifierCmd [options] <samplefile>\nNote this is the legacy command for one sample classification ", "", options, "");
            return;
        }

        if (propFile == null && gene == null) {
            gene = CmdOptions.DEFAULT_GENE;
        }

        ClassifierCmd classifierCmd = new ClassifierCmd();
        printLicense();
        classifierCmd.doClassify(queryFile, outputFile, propFile, format, gene, min_bootstrap_words);
    }
}