Package dkpro.similarity.algorithms.lexsub

Source Code of dkpro.similarity.algorithms.lexsub.TWSISubstituteWrapper

package dkpro.similarity.algorithms.lexsub;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang.NotImplementedException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import de.tudarmstadt.langtech.substituter.MLSenseSubstituter;
import de.tudarmstadt.langtech.substituter.SenseSubstituter;
import de.tudarmstadt.langtech.substituter.Substitution;
import de.tudarmstadt.ukp.dkpro.core.api.resources.DkproContext;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import dkpro.similarity.algorithms.api.JCasTextSimilarityMeasureBase;
import dkpro.similarity.algorithms.api.SimilarityException;
import dkpro.similarity.algorithms.api.TextSimilarityMeasure;


/**
* Similarity measure based on the lexical substitution system
* based on supervised word sense disambiguation
* <a href="http://link.springer.com/article/10.1007%2Fs10579-012-9180-5">(Biemann, 2012)</a>.
*/
public class TWSISubstituteWrapper
  extends JCasTextSimilarityMeasureBase
{
  SenseSubstituter sensub;
  TextSimilarityMeasure measure;
 
  public TWSISubstituteWrapper(TextSimilarityMeasure measure)
    throws IOException
  {   
    this.sensub = new MLSenseSubstituter(DkproContext.getContext().getWorkspace() + "/TWSI2/conf/TWSI2_config.conf");
    this.measure = measure;
  }
 
  @Override
  public double getSimilarity(JCas jcas1, JCas jcas2)
    throws SimilarityException
  {
    List<String> subst1 = getSubstitutions(jcas1);
    List<String> subst2 = getSubstitutions(jcas2);
   
    return measure.getSimilarity(subst1, subst2);
  }
 
    @Override
    public double getSimilarity(JCas jcas1, JCas jcas2, Annotation coveringAnnotation1,
            Annotation coveringAnnotation2)
        throws SimilarityException
    {
        List<String> subst1 = getSubstitutions(jcas1, coveringAnnotation1);
        List<String> subst2 = getSubstitutions(jcas2, coveringAnnotation2);
       
        return measure.getSimilarity(subst1, subst2);
    }
  
  public List<String> getSubstitutions(JCas jcas)
  {
    List<String> tokens = new ArrayList<String>();
    List<String> postags = new ArrayList<String>();;
   
    for (Token t : JCasUtil.select(jcas, Token.class))
    {
      try
      {
        tokens.add(t.getLemma().getValue().toLowerCase());
        postags.add(t.getPos().getPosValue());
      }
      catch (NullPointerException e) {
        System.err.println("Couldn't read lemma value for token \"" + t.getCoveredText() + "\"");
      }
    }
   
    return getSubstitutions(tokens, postags);
  }
 
    public List<String> getSubstitutions(JCas jcas, Annotation coveringAnnotation)
    {
        List<String> tokens = new ArrayList<String>();
        List<String> postags = new ArrayList<String>();;
       
        for (Token t : JCasUtil.selectCovered(jcas, Token.class, coveringAnnotation))
        {
            try
            {
                tokens.add(t.getLemma().getValue().toLowerCase());
                postags.add(t.getPos().getPosValue());
            }
            catch (NullPointerException e) {
                System.err.println("Couldn't read lemma value for token \"" + t.getCoveredText() + "\"");
            }
        }
       
        return getSubstitutions(tokens, postags);
    }

    public List<String> getSubstitutions(List<String> tokens, List<String> postags)
  { 
    // Append BOS + EOS tags
    tokens.add(0, "%^%");
    postags.add(0, "BOS");
    tokens.add("%$%");
    postags.add("EOS");   
       
    // Sense substitutor operates on arrays
    String[] tokenArray = tokens.toArray(new String[tokens.size()]);
    String[] postagsArray = postags.toArray(new String[postags.size()]);
   
    List<String> resultList = new ArrayList<String>();
   
    for (int i = 0; i < tokens.size(); i++)
    {
      // System.out.println(postags.get(i) + " / " + tokens.get(i));
     
      // TWSI only operates on nouns
      if (postags.get(i).startsWith("NN"))
      {
        try
        {
          Substitution subst = sensub.getSubstitution(i, tokenArray, postagsArray);
         
          if (subst != null)
          {
            for (String[] substitution : subst.getSubstitutions())
            {
              //resultList.add(subst.getSense().replaceAll("@@", ""));
              resultList.add(substitution[0]);
            }
          }
          else
          {
            resultList.add(tokens.get(i));
          }
        }
        catch (InstantiationError e)
        {
          resultList.add(tokens.get(i));
        }
      } else {
        resultList.add(tokens.get(i));
      }
    }
   
    // Remove BOF + EOF words
    resultList.remove(0);
    resultList.remove(resultList.size() - 1);
   
    System.out.println(resultList);
   
    return resultList;
  }

  @Override
  public double getSimilarity(Collection<String> stringList1,
      Collection<String> stringList2)
    throws SimilarityException
  {
    throw new SimilarityException(new NotImplementedException());
  }
 
  @Override
  public String getName()
  {
    return this.getClass().getSimpleName() + "_" + measure.getName();
  }

}
TOP

Related Classes of dkpro.similarity.algorithms.lexsub.TWSISubstituteWrapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.