/*
* Copyright (C) 2012 Michigan State University <rdpstaff at msu.edu>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.msu.cme.rdp.multicompare;
import edu.msu.cme.rdp.classifier.Classifier;
import edu.msu.cme.rdp.classifier.ClassifierCmd;
import edu.msu.cme.rdp.classifier.cli.CmdOptions;
import edu.msu.cme.rdp.classifier.io.ClassificationResultFormatter;
import edu.msu.cme.rdp.classifier.utils.ClassifierFactory;
import edu.msu.cme.rdp.multicompare.taxon.MCTaxon;
import edu.msu.cme.rdp.multicompare.visitors.DefaultPrintVisitor;
import edu.msu.cme.rdp.taxatree.ConcretRoot;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.output.NullWriter;
/**
*
* @author fishjord
*/
public class Main {
private static final Options options = new Options();
static {
options.addOption(new Option(CmdOptions.QUERYFILE_SHORT_OPT, CmdOptions.QUERYFILE_LONG_OPT, false, CmdOptions.QUERYFILE_DESC));
options.addOption(new Option(CmdOptions.OUTFILE_SHORT_OPT, CmdOptions.OUTFILE_LONG_OPT, true, CmdOptions.OUTFILE_DESC));
options.addOption(new Option(CmdOptions.TRAINPROPFILE_SHORT_OPT, CmdOptions.TRAINPROPFILE_LONG_OPT, true, CmdOptions.TRAINPROPFILE_DESC));
options.addOption(new Option(CmdOptions.FORMAT_SHORT_OPT, CmdOptions.FORMAT_LONG_OPT, true, CmdOptions.FORMAT_DESC));
options.addOption(new Option(CmdOptions.GENE_SHORT_OPT, CmdOptions.GENE_LONG_OPT, true, CmdOptions.GENE_DESC));
options.addOption(new Option(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT, CmdOptions.MIN_BOOTSTRAP_WORDS_LONG_OPT, true, CmdOptions.MIN_WORDS_DESC));
options.addOption(new Option(CmdOptions.HIER_OUTFILE_SHORT_OPT, CmdOptions.HIER_OUTFILE_LONG_OPT, true, CmdOptions.HIER_OUTFILE_DESC));
options.addOption(new Option(CmdOptions.BOOTSTRAP_SHORT_OPT, CmdOptions.BOOTSTRAP_LONG_OPT, true, CmdOptions.BOOTSTRAP_DESC));
options.addOption(new Option(CmdOptions.BOOTSTRAP_OUTFILE_SHORT_OPT, CmdOptions.BOOTSTRAP_OUTFILE_LONG_OPT, true, CmdOptions.BOOTSTRAP_OUTFILE_DESC));
options.addOption(new Option(CmdOptions.SHORTSEQ_OUTFILE_SHORT_OPT, CmdOptions.SHORTSEQ_OUTFILE_LONG_OPT, true, CmdOptions.SHORTSEQ_OUTFILE_DESC));
options.addOption(new Option(CmdOptions.BIOMFILE_SHORT_OPT, CmdOptions.BIOMFILE_LONG_OPT, true, CmdOptions.BIOMFILE_DESC));
options.addOption(new Option(CmdOptions.METADATA_SHORT_OPT, CmdOptions.METADATA_LONG_OPT, true, CmdOptions.METADATA_DESC));
}
public static void printResults(ConcretRoot<MCTaxon> root, List<MCSample> samples, PrintStream heirOut, PrintStream bootstrapOut) throws IOException {
DefaultPrintVisitor printVisitor = new DefaultPrintVisitor(heirOut, samples);
root.topDownVisit(printVisitor);
for (MCSample sample : samples) {
MCSamplePrintUtil.printBootstrapCountTable(bootstrapOut, sample);
}
}
public static Map<String, String> readSampleMapping(String file) throws IOException{
Map<String, String> ret = new HashMap();
BufferedReader reader = new BufferedReader(new FileReader(file));
String line;
while((line = reader.readLine()) != null) {
if(line.trim().equals("")) continue;
String seqid = line.split("\t")[0].trim();
String sample = line.split("\t")[1].trim();
ret.put(seqid, sample);
}
reader.close();
return ret;
}
public static Map<String, Integer> readReplicateMapping(String file) throws IOException {
Map<String, Integer> ret = new HashMap();
BufferedReader reader = new BufferedReader(new FileReader(file));
String line;
while((line = reader.readLine()) != null) {
if(line.trim().equals("")) continue;
String seqid = line.split("\t")[0].trim();
int replicates = Integer.valueOf(line.split("\t")[1].trim());
ret.put(seqid, replicates);
}
reader.close();
return ret;
}
public static void main(String[] args) throws Exception {
PrintStream hier_out = null;
PrintWriter assign_out = new PrintWriter(new NullWriter());
PrintStream bootstrap_out = null;
File hier_out_filename = null;
String propFile = null;
File biomFile = null;
File metadataFile = null;
PrintWriter shortseq_out = null;
List<MCSample> samples = new ArrayList();
ClassificationResultFormatter.FORMAT format = ClassificationResultFormatter.FORMAT.allRank;
float conf = CmdOptions.DEFAULT_CONF;
String gene = null;
int min_bootstrap_words = Classifier.MIN_BOOTSTRSP_WORDS;
try {
CommandLine line = new PosixParser().parse(options, args);
if (line.hasOption(CmdOptions.OUTFILE_SHORT_OPT)) {
assign_out = new PrintWriter(line.getOptionValue(CmdOptions.OUTFILE_SHORT_OPT));
} else {
throw new IllegalArgumentException("Require the output file for classification assignment" );
}
if (line.hasOption(CmdOptions.HIER_OUTFILE_SHORT_OPT)) {
hier_out_filename = new File(line.getOptionValue(CmdOptions.HIER_OUTFILE_SHORT_OPT));
hier_out = new PrintStream(hier_out_filename);
}
if (line.hasOption(CmdOptions.BIOMFILE_SHORT_OPT)) {
biomFile = new File(line.getOptionValue(CmdOptions.BIOMFILE_SHORT_OPT));
}
if (line.hasOption(CmdOptions.METADATA_SHORT_OPT)) {
metadataFile = new File(line.getOptionValue(CmdOptions.METADATA_SHORT_OPT));
}
if (line.hasOption(CmdOptions.TRAINPROPFILE_SHORT_OPT)) {
if (gene != null) {
throw new IllegalArgumentException("Already specified the gene from the default location. Can not specify train_propfile");
} else {
propFile = line.getOptionValue(CmdOptions.TRAINPROPFILE_SHORT_OPT);
}
}
if (line.hasOption(CmdOptions.FORMAT_SHORT_OPT)) {
String f = line.getOptionValue(CmdOptions.FORMAT_SHORT_OPT);
if (f.equalsIgnoreCase("allrank")) {
format = ClassificationResultFormatter.FORMAT.allRank;
} else if (f.equalsIgnoreCase("fixrank")) {
format = ClassificationResultFormatter.FORMAT.fixRank;
} else if (f.equalsIgnoreCase("filterbyconf")) {
format = ClassificationResultFormatter.FORMAT.filterbyconf;
} else if (f.equalsIgnoreCase("db")) {
format = ClassificationResultFormatter.FORMAT.dbformat;
} else if (f.equalsIgnoreCase("biom")) {
format = ClassificationResultFormatter.FORMAT.biom;
}else {
throw new IllegalArgumentException("Not an valid output format, only allrank, fixrank, biom, filterbyconf and db allowed");
}
}
if (line.hasOption(CmdOptions.GENE_SHORT_OPT)) {
if (propFile != null) {
throw new IllegalArgumentException("Already specified train_propfile. Can not specify gene any more");
}
gene = line.getOptionValue(CmdOptions.GENE_SHORT_OPT).toLowerCase();
if (!gene.equals(ClassifierFactory.RRNA_16S_GENE) && !gene.equals(ClassifierFactory.FUNGALLSU_GENE)
&& !gene.equals(ClassifierFactory.FUNGALITS_warcup_GENE) && !gene.equals(ClassifierFactory.FUNGALITS_unite_GENE) ) {
throw new IllegalArgumentException(gene + " not found, choose from" + ClassifierFactory.RRNA_16S_GENE + ", "
+ ClassifierFactory.FUNGALLSU_GENE + ", " + ClassifierFactory.FUNGALITS_warcup_GENE + ", " + ClassifierFactory.FUNGALITS_unite_GENE);
}
}
if (line.hasOption(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT)) {
min_bootstrap_words = Integer.parseInt(line.getOptionValue(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT));
if (min_bootstrap_words < Classifier.MIN_BOOTSTRSP_WORDS) {
throw new IllegalArgumentException(CmdOptions.MIN_BOOTSTRAP_WORDS_LONG_OPT + " must be at least " + Classifier.MIN_BOOTSTRSP_WORDS);
}
}
if (line.hasOption(CmdOptions.BOOTSTRAP_SHORT_OPT)) {
String confString = line.getOptionValue(CmdOptions.BOOTSTRAP_SHORT_OPT);
try {
conf = Float.valueOf(confString);
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Confidence must be a decimal number");
}
if (conf < 0 || conf > 1) {
throw new IllegalArgumentException("Confidence must be in the range [0,1]");
}
}
if (line.hasOption(CmdOptions.SHORTSEQ_OUTFILE_SHORT_OPT)) {
shortseq_out = new PrintWriter(line.getOptionValue(CmdOptions.SHORTSEQ_OUTFILE_SHORT_OPT));
}
if (line.hasOption(CmdOptions.BOOTSTRAP_OUTFILE_SHORT_OPT)) {
bootstrap_out = new PrintStream(line.getOptionValue(CmdOptions.BOOTSTRAP_OUTFILE_SHORT_OPT));
}
if ( format.equals( ClassificationResultFormatter.FORMAT.biom) && biomFile == null){
throw new IllegalArgumentException("biom format requires an input biom file");
}
if ( biomFile != null ) { // if input biom file provided, use biom format
format = ClassificationResultFormatter.FORMAT.biom;
}
args = line.getArgs();
for ( String arg: args){
String[] inFileNames = arg.split(",");
File inputFile = new File(inFileNames[0]);
File idmappingFile = null;
if (!inputFile.exists()) {
throw new IllegalArgumentException("Failed to find input file \"" + inFileNames[0] + "\"");
}
if (inFileNames.length == 2) {
idmappingFile = new File(inFileNames[1]);
if (!idmappingFile.exists()) {
throw new IllegalArgumentException("Failed to find input file \"" + inFileNames[1] + "\"");
}
}
MCSample nextSample = new MCSample(inputFile, idmappingFile);
samples.add(nextSample);
}
if ( propFile == null && gene == null){
gene = CmdOptions.DEFAULT_GENE;
}
if (samples.size() < 1) {
throw new IllegalArgumentException("Require at least one sample files");
}
}catch (Exception e) {
System.out.println("Command Error: " + e.getMessage());
new HelpFormatter().printHelp(80, " [options] <samplefile>[,idmappingfile] ...", "", options, "");
return;
}
MultiClassifier multiClassifier = new MultiClassifier(propFile, gene, biomFile, metadataFile);
MultiClassifierResult result = multiClassifier.multiCompare(samples, conf, assign_out, format, min_bootstrap_words);
assign_out.close();
if ( hier_out != null){
DefaultPrintVisitor printVisitor = new DefaultPrintVisitor(hier_out, samples);
result.getRoot().topDownVisit(printVisitor);
hier_out.close();
if ( multiClassifier.hasCopyNumber()){
// print copy number corrected counts
File cn_corrected_s = new File (hier_out_filename.getParentFile(), "cnadjusted_" + hier_out_filename.getName());
PrintStream cn_corrected_hier_out = new PrintStream(cn_corrected_s);
printVisitor = new DefaultPrintVisitor(cn_corrected_hier_out, samples, true);
result.getRoot().topDownVisit(printVisitor);
cn_corrected_hier_out.close();
}
}
if ( bootstrap_out != null){
for (MCSample sample : samples) {
MCSamplePrintUtil.printBootstrapCountTable(bootstrap_out, sample);
}
bootstrap_out.close();
}
if ( shortseq_out != null){
for (String id: result.getBadSequences()){
shortseq_out.write(id +"\n");
}
shortseq_out.close();
}
}
}