package joshua.discriminative.feature_related;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import joshua.corpus.vocab.BuildinSymbol;
import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.hypergraph.DiskHyperGraph;
import joshua.decoder.hypergraph.HyperGraph;
import joshua.discriminative.DiscriminativeSupport;
import joshua.discriminative.FileUtilityOld;
import joshua.discriminative.feature_related.feature_template.EdgeBigramFT;
import joshua.discriminative.feature_related.feature_template.FeatureTemplate;
import joshua.discriminative.feature_related.feature_template.NgramFT;
import joshua.discriminative.feature_related.feature_template.TMFT;
/**
 * Command-line driver that collects a feature table from hypergraphs stored on disk.
 *
 * <p>The program reads three list files (hypergraph item files, hypergraph rule files, and
 * per-file sentence counts), reads every hypergraph through {@link DiskHyperGraph}, runs
 * {@code FeatureExtractionHG.featureExtractionOnHG} with the selected feature templates to
 * accumulate entries into one table, and finally writes that table with
 * {@code FileUtilityOld.printHashTblAboveThreshold}.
 *
 * <p>Arguments, in order:
 * <ol>
 *   <li>{@code f_l_test_items} - file listing the hypergraph item files</li>
 *   <li>{@code f_l_test_rules} - file listing the hypergraph rule files</li>
 *   <li>{@code f_l_num_sents} - file listing the number of sentences in each hypergraph file</li>
 *   <li>{@code use_tm_feat} - {@code true} to add a {@link TMFT} template</li>
 *   <li>{@code use_lm_feat} - {@code true} to add an {@link NgramFT} template</li>
 *   <li>{@code use_edge_ngram_only} - {@code true} to add an {@link EdgeBigramFT} template;
 *       only consulted when {@code use_lm_feat} is false</li>
 *   <li>{@code use_joint_tm_lm_feature} - not implemented; {@code true} aborts the run</li>
 *   <li>{@code f_feature_set_out} - output file for the feature table</li>
 *   <li>{@code baseline_lm_order} - integer passed to the n-gram feature templates</li>
 *   <li>{@code threshold} - optional, defaults to {@code 0.0}; passed to the table writer</li>
 * </ol>
 */
public class FeatureSelectionHG {

    /*for 919 sent, time_on_reading: 148797
    time_on_orc_extract: 580286*/

    /** Number of mandatory arguments: indices 0..8 are all read unconditionally. */
    private static final int NUM_REQUIRED_ARGS = 9;

    /** Index of the optional threshold argument. */
    private static final int THRESHOLD_ARG_INDEX = 9;

    /**
     * Program entry point; see the class documentation for the argument list.
     *
     * @param args command-line arguments
     * @throws NumberFormatException if {@code baseline_lm_order}, {@code threshold}, or a
     *     sentence-count entry is not a valid number
     * @throws IllegalArgumentException if the rule-file list or the sentence-count list has
     *     fewer entries than the item-file list
     */
    public static void main(String[] args) {
        //long start_time = System.currentTimeMillis();
        // args[8] (baseline_lm_order) is mandatory, so nine arguments are required. The previous
        // guard of "< 8" let an 8-argument call through and crashed on args[8].
        if (args.length < NUM_REQUIRED_ARGS) {
            printUsage(args);
            // NOTE(review): exit status 0 on a usage error is kept from the original so that
            // existing callers are unaffected; consider a non-zero status.
            System.exit(0);
        }

        String f_l_test_items = args[0].trim();
        String f_l_test_rules = args[1].trim();
        String f_l_num_sents = args[2].trim();
        boolean use_tm_feat = Boolean.parseBoolean(args[3].trim());
        boolean use_lm_feat = Boolean.parseBoolean(args[4].trim());
        boolean use_edge_ngram_only = Boolean.parseBoolean(args[5].trim());
        boolean use_joint_tm_lm_feature = Boolean.parseBoolean(args[6].trim());
        String f_feature_set_out = args[7].trim();//output file
        int baseline_lm_order = Integer.parseInt(args[8].trim());

        double threshold = 0.0;
        if (args.length > THRESHOLD_ARG_INDEX) {
            threshold = Double.parseDouble(args[THRESHOLD_ARG_INDEX].trim());
        }

        boolean saveModelScore = true;//diskHG have costs stored

        // NOTE(review): the original comments around this declaration were unreadable
        // (mis-encoded characters). The value is handed to the n-gram feature templates and to
        // DiskHyperGraph; confirm that 0 is the intended state ID.
        int ngramStateID = 0;

        boolean addBaselineFeature = true;//TODO
        String baselineFeatureName = "baseline_lzf";//TODO

        SymbolTable p_symbol = new BuildinSymbol(null);

        // ##setup feature templates list
        ArrayList<FeatureTemplate> featureTemplates = new ArrayList<FeatureTemplate>();
        boolean useIntegerString = false;
        boolean useRuleIDName = false;

        if (use_tm_feat) {
            featureTemplates.add(new TMFT(p_symbol, useIntegerString, useRuleIDName));
        }

        if (use_lm_feat) {
            //TODO: unigram and bi gram
            featureTemplates.add(new NgramFT(p_symbol, false, ngramStateID, baseline_lm_order, 1, 2));
        } else if (use_edge_ngram_only) {//exclusive with use_lm_feat
            featureTemplates.add(new EdgeBigramFT(p_symbol, ngramStateID, baseline_lm_order, useIntegerString));
        }

        if (use_joint_tm_lm_feature) {
            //TODO: not implement
            System.out.println("not implemented");
            System.exit(0);
        }

        List<String> testItemsFiles = DiscriminativeSupport.readFileList(f_l_test_items);
        List<String> testRulesFiles = DiscriminativeSupport.readFileList(f_l_test_rules);
        List<String> l_num_sents = DiscriminativeSupport.readFileList(f_l_num_sents);

        // The three lists are indexed in parallel below. Fail before any hypergraph is read
        // rather than part-way through the run. Longer rule/count lists are still accepted.
        int numFiles = testItemsFiles.size();
        if (testRulesFiles.size() < numFiles || l_num_sents.size() < numFiles) {
            throw new IllegalArgumentException(
                "list files are inconsistent: " + numFiles + " item files, "
                + testRulesFiles.size() + " rule files, "
                + l_num_sents.size() + " sentence counts");
        }

        //#### extract feat tbl
        HashMap<String, Double> tbl_feats = new HashMap<String, Double>();
        for (int fid = 0; fid < numFiles; fid++) {
            System.out.println("############Process file id " + fid);
            DiskHyperGraph dhg_train = new DiskHyperGraph(p_symbol, ngramStateID, saveModelScore, null);
            dhg_train.initRead(testItemsFiles.get(fid), testRulesFiles.get(fid), null);

            int total_num_sent = Integer.parseInt(l_num_sents.get(fid));
            for (int sent_id = 0; sent_id < total_num_sent; sent_id++) {
                System.out.println("############Process sentence " + sent_id);
                HyperGraph hg_train = dhg_train.readHyperGraph();
                FeatureExtractionHG.featureExtractionOnHG(hg_train, tbl_feats, null, featureTemplates);
            }
        }
        System.out.println("===feature table size is " + tbl_feats.size());

        //#### write hashtable
        boolean useZeroValue = true;
        boolean keyOnly = false;
        FileUtilityOld.printHashTblAboveThreshold(tbl_feats, f_feature_set_out, keyOnly, threshold, useZeroValue, addBaselineFeature, baselineFeatureName);
    }

    /** Prints the expected command line followed by the arguments that were actually given. */
    private static void printUsage(String[] args) {
        System.out.println("wrong command, correct command should be: java FeatureSelectionHG f_l_test_items f_l_test_rules f_l_num_sents use_tm_feat use_lm_feat use_edge_ngram_only use_joint_tm_lm_feature f_feature_set_out baseline_lm_order [threshold]");
        System.out.println("num of args is " + args.length);
        for (int i = 0; i < args.length; i++) {
            System.out.println("arg is: " + args[i]);
        }
    }
}