package gannuWSD.algorithms;
import gannuNLP.data.AmbiguousWord;
import gannuNLP.data.Input;
import gannuNLP.data.Sense;
import gannuNLP.dictionaries.Dictionary;
import gannuUtil.Util;
import gannuWSD.testing.Decision;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import weka.attributeSelection.ASEvaluation;
import weka.attributeSelection.ASSearch;
import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.supervised.attribute.AttributeSelection;
/**
* Interface to Weka for supervised WSD.
* @author Francisco Viveros-Jiménez
*
*/
public class Weka extends WSDAlgorithm {
/**
 * Creates a Weka-based WSD algorithm instance; sets the algorithm name to "Weka".
 */
public Weka()
{
super();
this.name="Weka";
}
/**
 * Fully qualified Weka class name of the selected classifier
 * (instantiated via Class.forName in disambiguate).
 * Lazily loaded from the "classifier" algorithm parameter.
 */
String classifier;
/**
 * Creates the Weka samples (Instances object) corresponding to a target word.
 * The generated training set is cached on disk under data/weka/ and reloaded
 * on subsequent calls.
 * @param target Target word.
 * @return Instances object (Weka samples).
 * @throws Exception
 */
Instances loadSamples(AmbiguousWord target)throws Exception
{
	System.out.println("Generating training set");
	String lemma=target.getLemma();
	File f=new File("data/weka/"+Dictionary.normalizeLemmaforFile(lemma)+".ins");
	Instances ins=null;
	if(f.exists())
	{
		// Reuse the cached training set from a previous run.
		ins=(Instances)Util.loadObject(f);
	}
	else
	{
		// Feature vocabulary: union of the bags of words of all senses.
		ArrayList<String> features=new ArrayList<String>();
		int totalsamples=0;
		for(Sense sense:target.getSenses())
		{
			totalsamples+=sense.getSamples().size();
			features.addAll(sense.getBagOfWords());
		}
		features=Util.removeDuplicates(features);
		features.trimToSize();
		// Sorted so that feature columns can be located with binary search.
		Collections.sort(features);
		// One row per sample; the extra last column holds the sense index (class).
		// Java arrays are zero-initialized, so no explicit clearing loop is needed.
		short samples[][]=new short[totalsamples][features.size()+1];
		int i=0;
		FastVector fvClassVal = new FastVector(target.getSenses().size());
		short isense=0;
		// NOTE(review): assumes getParsedSamples() yields exactly
		// getSamples().size() samples per sense — TODO confirm.
		for(Sense sense:target.getSenses())
		{
			fvClassVal.addElement(String.valueOf(isense));
			for(ArrayList<String> sample:sense.getParsedSamples())
			{
				for(String word:sample)
				{
					// Guard against sample words missing from the feature list:
					// a negative binarySearch result would otherwise cause an
					// ArrayIndexOutOfBoundsException.
					int idx=Collections.binarySearch(features, word);
					if(idx>=0)
						samples[i][idx]++;
				}
				samples[i][features.size()]=isense;
				i++;
			}
			isense++;
		}
		// Generating attributes: one numeric attribute per feature word.
		FastVector fvWekaAttributes = new FastVector(features.size()+1);
		for(String feature:features)
		{
			fvWekaAttributes.addElement(new Attribute(feature));
		}
		// Generating the nominal class attribute listing the sense indices.
		fvWekaAttributes.addElement(new Attribute("Sense",fvClassVal));
		// Create an empty training set
		ins = new Instances(lemma, fvWekaAttributes, totalsamples);
		// Set class index
		ins.setClassIndex(features.size());
		// Generating examples
		for(i=0;i<totalsamples;i++)
		{
			System.out.println("Loading sample "+i);
			Instance iExample = new Instance(features.size()+1);
			iExample.setDataset(ins);
			for(int j=0;j<features.size();j++)
				iExample.setValue(j, samples[i][j]);
			iExample.setClassValue(String.valueOf(samples[i][features.size()]));
			ins.add(iExample);
		}
		// Cache the training set for future runs.
		Util.writeObject(f, ins);
	}
	return ins;
}
/**
 * Creates the Weka samples (Instances object) corresponding to a target word.
 * These samples are filtered by using the Evaluator and Search classes
 * as specified in the "evaluator" and "search" parameters. If either
 * parameter is missing, the unfiltered samples are returned.
 * @param target Target word.
 * @return Instances object filtered with the Evaluator and Search classes.
 * @throws Exception
 */
Instances loadFilteredInstances(AmbiguousWord target)throws Exception
{
	String Evaluator=this.getValue("evaluator");
	String Search=this.getValue("search");
	Instances ins=null;
	if(Evaluator!=null&&Search!=null)
	{
		System.out.println("Filtering attributes");
		// Cache file name includes both the evaluator and the search class so
		// different filter configurations do not collide on the same file.
		// (Bug fix: the search class name was previously the literal "Search".)
		File f=new File("data/weka/"+Dictionary.normalizeLemmaforFile(target.getLemma())+"@"+Evaluator+"@"+Search+".ins");
		if(f.exists())
		{
			ins=(Instances)Util.loadObject(f);
		}
		else
		{
			ins=this.loadSamples(target);
			// Build the attribute-selection filter from the configured classes.
			AttributeSelection filter= new AttributeSelection();
			filter.setEvaluator((ASEvaluation)Class.forName(Evaluator).newInstance());
			filter.setSearch((ASSearch)Class.forName(Search).newInstance());
			filter.setInputFormat(ins);
			ins=Filter.useFilter(ins, (Filter)filter);
			// Cache the filtered instances for future runs.
			Util.writeObject(f, ins);
		}
	}
	else
	{
		// No filtering configured: fall back to the raw training set.
		ins=this.loadSamples(target);
	}
	return ins;
}
/**
 * Disambiguates a target word by training (or loading a cached) Weka
 * classifier on the word's sense-annotated samples and classifying a
 * feature vector built from the context window.
 * @param target Target word to disambiguate.
 * @param window Context words surrounding the target.
 * @return Decision holding a weight per sense (the classifier's class
 *         distribution) plus the overlapping context lemmas.
 * @throws Exception
 */
@Override
public Decision disambiguate(AmbiguousWord target,
ArrayList<AmbiguousWord> window) throws Exception{
// Lazily read the classifier class name from the algorithm parameters.
if(this.classifier==null)
this.classifier=this.getValue("classifier");
Decision decision=new Decision(target,window);
if(target.getSenses().size()>1)
{
Instances ins=this.loadFilteredInstances(target);
System.out.println("Trying "+target.getIndex());
Classifier cModel=null;
String Evaluator=this.getValue("evaluator");
String Search=this.getValue("search");
// Empty strings keep the cache-file name well-formed when no filter is set.
if(Evaluator==null)
Evaluator="";
if(Search==null)
Search="";
// Trained models are cached per classifier/lemma/filter combination.
File f=new File("data/weka/"+this.classifier+"@"+Dictionary.normalizeLemmaforFile(target.getLemma())+"@"+Evaluator+"@"+Search+".csf");
if(f.exists())
{
cModel=(Classifier)Util.loadObject(f);
}
else
{
// Instantiate and train the configured classifier, then cache it.
cModel = (Classifier)Class.forName(this.classifier).newInstance();
cModel.buildClassifier(ins);
Util.writeObject(f, cModel);
}
//Instantiate the classifier
// Build the feature vector for the context window, using the first
// training instance only as a template for the attribute layout.
Instance base=ins.firstInstance();
Instance sample=new Instance(base.numAttributes());
sample.setDataset(ins);
int c[]=new int[base.numAttributes()];
for(int i=0;i<base.numAttributes();i++)
{
c[i]=0;
}
// c[i] counts how many window words match attribute i's feature lemma.
for(int i=0;i<base.numAttributes();i++)
{
Attribute att=base.attribute(i);
for(AmbiguousWord word:window)
{
if(word.getLemma().equals(att.name()))
{
c[i]++;
}
}
}
// NOTE(review): this loop also writes the class attribute (last index)
// with a count value — presumably harmless for classification, but
// setting it missing would be cleaner; confirm against Weka's behavior.
for(int i=0;i<base.numAttributes();i++)
{
sample.setValue(i, ((double)c[i]));
}
// Per-sense probability distribution from the trained model.
double []w=cModel.distributionForInstance(sample);
for(int j=0;j<target.getSenses().size();j++)
{
// Collect the window lemmas that overlap this sense as supporting words.
ArrayList<String> dwords=new ArrayList<String>(window.size());
for(AmbiguousWord word:window)
{
if(this.overlap(target.getSenses().get(j), word.getLemma()))
dwords.add(word.getLemma());
}
decision.setSense(j, w[j], dwords);
}
}
else
// Monosemous word: trivially pick the only sense with a nominal weight.
decision.setSense(0, 0.1, new ArrayList<String>());
decision.calculateAnswer();
return decision;
}
/**
 * No per-document initialization is required by this algorithm;
 * all state is built lazily in disambiguate().
 */
@Override
public void init(Input document) {
// TODO Auto-generated method stub
}
/**
 * Tells whether a context word can help disambiguate the target word:
 * it is useful when its lemma appears in the bag of words of any of the
 * target's senses.
 * @param target Target word being disambiguated.
 * @param windowWord Candidate context word.
 * @return true if the window word's lemma occurs in some sense's bag of words.
 * @throws Exception
 */
@Override
public boolean IsUseful(AmbiguousWord target, AmbiguousWord windowWord)
throws Exception {
	String lemma = windowWord.getLemma();
	// List.contains uses equals(), matching the original word-by-word comparison.
	for (Sense sense : target.getSenses()) {
		if (sense.getBagOfWords().contains(lemma)) {
			return true;
		}
	}
	return false;
}
}