
Source Code of gannuWSD.algorithms.Weka

package gannuWSD.algorithms;

import gannuNLP.data.AmbiguousWord;
import gannuNLP.data.Input;
import gannuNLP.data.Sense;
import gannuNLP.dictionaries.Dictionary;
import gannuUtil.Util;
import gannuWSD.testing.Decision;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;

import weka.attributeSelection.ASEvaluation;
import weka.attributeSelection.ASSearch;
import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.supervised.attribute.AttributeSelection;


/**
* Interface to Weka for supervised WSD.
* @author Francisco Viveros-Jiménez
*
*/
public class Weka extends WSDAlgorithm {
 
  public Weka()
  {
    super();
    this.name="Weka";
  }
  /**
   * Fully qualified class name of the selected Weka classifier.
   */
  String classifier;
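  /*
   * Configuration parameters read via getValue(...) (the example class names
   * are illustrative Weka classes, not defaults of this library):
   *   "classifier" - fully qualified Weka classifier, e.g. weka.classifiers.bayes.NaiveBayes
   *   "evaluator"  - optional ASEvaluation class for attribute selection,
   *                  e.g. weka.attributeSelection.InfoGainAttributeEval
   *   "search"     - optional ASSearch class, e.g. weka.attributeSelection.Ranker
   */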
 
  /**
   * Creates the Weka samples (Instances object) corresponding to a target word.
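   * Generated samples are cached on disk (under data/weka/) and reused on later calls.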
   * @param target Target word.
   * @return Instances object (Weka samples).
   * @throws Exception
   */
  Instances loadSamples(AmbiguousWord target)throws Exception
  {
    System.out.println("Generating training set");
    String lemma=target.getLemma();
    File f=new File("data/weka/"+Dictionary.normalizeLemmaforFile(lemma)+".ins");
    Instances ins=null;
    if(f.exists())
    {
      ins=(Instances)Util.loadObject(f);     
    }
    else
    {
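      // Collect the union of the bag of words of every sense of the target;
      // each distinct word becomes one numeric (count) attribute.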
      ArrayList<String> features=new ArrayList<String>();
      int totalsamples=0;
      for(Sense sense:target.getSenses())
      {
        totalsamples+=sense.getSamples().size();
        features.addAll(sense.getBagOfWords());
      }
      features=Util.removeDuplicates(features);
      features.trimToSize();
      Collections.sort(features);
      // Rows are training samples, columns are per-word counts; the extra last
      // column holds the sense index (class label). Java zero-initializes
      // primitive arrays, so no explicit clearing is required.
      short samples[][]=new short[totalsamples][features.size()+1];
      int i=0;
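      // Nominal class values: one label per sense, identified by its sense index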
      FastVector fvClassVal = new FastVector(target.getSenses().size());
      short isense=0;
      for(Sense sense:target.getSenses())
      {
        fvClassVal.addElement(String.valueOf(isense));
        for(ArrayList<String> sample:sense.getParsedSamples())
        {
          for(String word:sample)
          {
            // Count the occurrences of each feature word in this sample;
            // skip words that are not part of the feature list
            int index=Collections.binarySearch(features, word);
            if(index>=0)
              samples[i][index]++;
          }
          samples[i][features.size()]=isense;
          i++;
        }
        isense++;
      }
      // Generate the Weka attributes: one numeric attribute per feature word
      FastVector fvWekaAttributes = new FastVector(features.size()+1);     
      for(String feature:features)
      {
        fvWekaAttributes.addElement(new Attribute(feature));
      }
      fvWekaAttributes.addElement(new Attribute("Sense",fvClassVal));
     
      //Generating the class attribute
       // Create an empty training set
       ins = new Instances(target.getLemma(), fvWekaAttributes,  totalsamples);
      
       // Set class index
       ins.setClassIndex(features.size());
       // Generating examples
       i=0;
       for(i=0;i<totalsamples;i++)
       {
         System.out.println("Loading sample "+i);
         Instance iExample = new Instance(features.size()+1);        
         iExample.setDataset(ins);
         for(int j=0;j<features.size();j++)
           iExample.setValue(j, samples[i][j]);
         iExample.setClassValue(String.valueOf(samples[i][features.size()]));
         ins.add(iExample);
       }
       Util.writeObject(f, ins);
    }
    return ins;
  }

  /**
   * Creates the Weka samples (Instances object) corresponding to a target word.
   * These samples are filtered with the Evaluator and Search classes
   * specified in the evaluator and search parameters.
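   * Filtered samples are cached on disk and reused on later calls.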
   * @param target Target word.
   * @return Instances object filtered with the Evaluator and Search classes.
   * @throws Exception
   */
  Instances loadFilteredInstances(AmbiguousWord target)throws Exception
  {   
    String Evaluator=this.getValue("evaluator");
    String Search=this.getValue("search");
    Instances ins=null;
    if(Evaluator!=null&&Search!=null)
    {
      System.out.println("Filtering attributes");
      File f=new File("data/weka/"+Dictionary.normalizeLemmaforFile(target.getLemma())+"@"+Evaluator+"@"+"Search"+".ins");
      if(f.exists())
      {       
        ins=(Instances)Util.loadObject(f);     
      }
      else
      {
        ins=this.loadSamples(target);
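        // Instantiate the configured evaluator and search method via reflection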
        AttributeSelection filter= new AttributeSelection();
        filter.setEvaluator((ASEvaluation)Class.forName(Evaluator).newInstance());
        filter.setSearch((ASSearch)Class.forName(Search).newInstance());
        filter.setInputFormat(ins);
        ins=Filter.useFilter(ins, (Filter)filter);
        Util.writeObject(f, ins);
      }
      
    }
    else
    {
      ins=this.loadSamples(target);
    }
    return ins;
  }
 

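  /**
   * Disambiguates the target word by classifying a bag-of-words vector built
   * from the context window with the configured Weka classifier.
   */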
  @Override
  public Decision disambiguate(AmbiguousWord target,
      ArrayList<AmbiguousWord> window) throws Exception{
   
    if(this.classifier==null)
      this.classifier=this.getValue("classifier");
    Decision decision=new Decision(target,window);

    if(target.getSenses().size()>1)
    {
      Instances ins=this.loadFilteredInstances(target);
      System.out.println("Trying "+target.getIndex());
      Classifier cModel=null;
      String Evaluator=this.getValue("evaluator");
      String Search=this.getValue("search");
      if(Evaluator==null)
        Evaluator="";
      if(Search==null)
        Search="";
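      // Trained classifiers are cached per classifier/lemma/filter configuration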
      File f=new File("data/weka/"+this.classifier+"@"+Dictionary.normalizeLemmaforFile(target.getLemma())+"@"+Evaluator+"@"+Search+".csf");
      if(f.exists())
      {
        cModel=(Classifier)Util.loadObject(f);
      }
      else
      {
        cModel = (Classifier)Class.forName(this.classifier).newInstance();       
             cModel.buildClassifier(ins);
             Util.writeObject(f, cModel);
      }
      // Build a test instance from the context window: count how many times
      // each attribute's word appears as a lemma in the window
      Instance base=ins.firstInstance();
      Instance sample=new Instance(base.numAttributes());
      sample.setDataset(ins);
      int c[]=new int[base.numAttributes()];
      for(int i=0;i<base.numAttributes();i++)
      {
        Attribute att=base.attribute(i);       
        for(AmbiguousWord word:window)
         {
           if(word.getLemma().equals(att.name()))
           {
             c[i]++; 
           }
         }
      }
      for(int i=0;i<base.numAttributes();i++)
      {
        sample.setValue(i, ((double)c[i]));
      }
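      // distributionForInstance returns one probability per class value,
      // i.e., per sense, in the same order as target.getSenses()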
     
      double []w=cModel.distributionForInstance(sample);
      for(int j=0;j<target.getSenses().size();j++)
      {
        ArrayList<String> dwords=new ArrayList<String>(window.size());
        for(AmbiguousWord word:window)
        {
          if(this.overlap(target.getSenses().get(j), word.getLemma()))
            dwords.add(word.getLemma());
        }
        decision.setSense(j, w[j], dwords);
      }
    }
    else
       decision.setSense(0, 0.1, new ArrayList<String>());
    decision.calculateAnswer();
    return decision;
  }
  @Override
  public void init(Input document) {
    // TODO Auto-generated method stub
   
  }
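  /**
   * A window word is useful only when its lemma appears in the bag of words
   * of at least one sense of the target word.
   */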
  @Override
  public boolean IsUseful(AmbiguousWord target, AmbiguousWord windowWord)
      throws Exception {
    for(Sense s:target.getSenses())
      for(String word:s.getBagOfWords())
        if(word.equals(windowWord.getLemma()))
          return true;
    return false;
  }


}
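For reference, the following standalone sketch (not part of the gannu sources; the class name, words, and counts are made up for illustration) reproduces the core pattern of loadSamples and disambiguate using the same Weka 3.6-style API seen above: numeric word-count attributes plus a nominal "Sense" class, a classifier trained on the counts, and distributionForInstance used to score an unseen context vector.

import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayes;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class WekaWsdSketch {
  public static void main(String[] args) throws Exception {
    // Two illustrative context words and a two-sense class attribute
    FastVector classValues = new FastVector(2);
    classValues.addElement("0");
    classValues.addElement("1");
    FastVector attributes = new FastVector(3);
    attributes.addElement(new Attribute("bank"));   // word-count feature
    attributes.addElement(new Attribute("river"));  // word-count feature
    attributes.addElement(new Attribute("Sense", classValues));

    Instances train = new Instances("target", attributes, 4);
    train.setClassIndex(2);

    // One training instance per sense-tagged sample: counts plus sense label
    double[][] counts = { {2, 0}, {1, 0}, {0, 2}, {0, 1} };
    String[] labels = { "0", "0", "1", "1" };
    for (int i = 0; i < counts.length; i++) {
      Instance inst = new Instance(3);
      inst.setDataset(train);
      inst.setValue(0, counts[i][0]);
      inst.setValue(1, counts[i][1]);
      inst.setClassValue(labels[i]);
      train.add(inst);
    }

    Classifier model = new NaiveBayes();
    model.buildClassifier(train);

    // Score an unseen context window in which "river" occurs once
    Instance context = new Instance(3);
    context.setDataset(train);
    context.setValue(0, 0.0);
    context.setValue(1, 1.0);
    double[] distribution = model.distributionForInstance(context);
    System.out.println("P(sense 0) = " + distribution[0]
        + ", P(sense 1) = " + distribution[1]);
  }
}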