// Tagging mode: load the maxent model, multitag each sentence of the input
// corpus, and write the multitaggings to the output file (optionally scoring
// against a gold-standard corpus when --test is given).
WordAndPOSDictionaryLabellingStrategy tagger = new WordAndPOSDictionaryLabellingStrategy(
    wd,
    pd,
    (options.has("K") ? options.valueOf(kspec).intValue() : 20), // K-best width; default 20
    maxentModel = new ZLMEM(options.valueOf(modspec)),
    new STFex(stPrior),
    seqMod,
    alg);
tagger.setMaxSearchBeam(fbWidth);
maxentModel.verbose = true;

// Choose the corpus reader according to the requested tokenisation format.
// NOTE(review): the underlying FileReaders are owned by the iterators and are
// never explicitly closed here — presumably closed on exhaustion; verify.
Iterator<List<Word>> corpus = null;
Iterator<List<Word>> goldCorpus = null;
String tokFormat = options.valueOf(tokenisation);
if (tokFormat.equalsIgnoreCase("srilm")) {
    corpus = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(inputspec))));
} else if (tokFormat.equalsIgnoreCase("candc")) {
    corpus = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(inputspec))));
} else {
    // Previously an unrecognised format left corpus == null and caused an
    // opaque NullPointerException at the first hasNext(); fail fast instead.
    throw new IllegalArgumentException("Unknown tokenisation format: " + tokFormat);
}
if (options.has("test") && tokFormat.equalsIgnoreCase("srilm")) {
    goldCorpus = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(goldstandspec))));
} else if (options.has("test") && tokFormat.equalsIgnoreCase("candc")) {
    goldCorpus = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(goldstandspec))));
}

boolean test = options.has("test");
ResultSink results = new ResultSink();
int sentCnt = 0;
tagger.setBetas(new double[] {beta});
// Hoisted out of the loop: the separator cannot change mid-run.
String lineSep = System.getProperty("line.separator");
BufferedWriter outf = new BufferedWriter(new FileWriter(options.valueOf(outputspec)));
try {
    while (corpus.hasNext()) {
        sentCnt++;
        List<Word> sent = corpus.next();
        List<List<Pair<Double,String>>> taggings = tagger.multitag(sent, beta);
        if (test) {
            // Gold corpus is consumed in lockstep with the input corpus;
            // assumes both files have the same number of sentences.
            List<Word> goldsent = goldCorpus.next();
            results.addSent(taggings, goldsent);
        }
        Iterator<Word> sentiter = sent.iterator();
        // output file format = word goldtag tag1 ... tagK
        outf.write("<s>" + lineSep);
        for (List<Pair<Double,String>> tagging : taggings) {
            Word nextw = sentiter.next();
            outf.write(nextw.getForm() + "\t1\t" + nextw.getPOS() + "\t1.0\t" + tagging.size() + "\t");
            // StringBuilder instead of repeated String concatenation (was O(n^2)).
            StringBuilder tags = new StringBuilder();
            for (Pair<Double,String> tg : tagging) {
                tags.append('\t').append(tg.b).append('\t').append(tg.a);
            }
            // write out the multitagging, minus the initial space (tab).
            outf.write(tags.substring(1) + lineSep);
        }
        outf.write("</s>" + lineSep);
        // Periodic flush so partial output survives a crash on long runs.
        if (sentCnt % 10 == 0) {
            outf.flush();
        }
    }
    outf.flush();
} finally {
    // Previously the writer leaked if multitag() or I/O threw mid-loop.
    outf.close();
}
if (test) {
    System.err.println(results.report());
}
long end = System.currentTimeMillis();
System.err.println("Time to tag: " + ((end - start + 0.0)/1000) + " seconds.");
} else if (options.has("tagdictextract")) {
    // Dictionary-extraction mode: build the word and POS tagging
    // dictionaries from the given corpus file.
    File wordDictFile = options.valueOf(wdictspec);
    File posDictFile = options.valueOf(pdictspec);
    File corpusFile = options.valueOf(inputspec);
    TaggingDictionaryExtractor extractor = new TaggingDictionaryExtractor(
        corpusFile, wordDictFile, posDictFile, options.valueOf(tokenisation));
    System.err.println("Extracting dictionaries from: "+corpusFile.toString()+" into files: "+wordDictFile.toString()+" and: "+posDictFile.toString()+"\n(wdict and posdict, resp.).");
    extractor.extract();
} else {
    // Training mode: extract maxent training features from the corpus.
    File corpusFile = options.valueOf(inputspec);
    File featFile = options.valueOf(outputspec);
    // Use the supertag-prior-aware feature extractor when a prior model was loaded.
    FeatureExtractor featureExtractor;
    if (stPrior == null) {
        featureExtractor = new STFex();
    } else {
        featureExtractor = new STFex(stPrior);
    }
    ZhangLeTrainingExtractor extractorApp = new ZhangLeTrainingExtractor(
        corpusFile, featFile, options.valueOf(tokenisation), featureExtractor);
    System.err.println("Extracting features from file: " + corpusFile.toString() + ", and placing extracted features in: " + featFile.toString() + ".");
    extractorApp.writeFeats();
}