package gannuWSD.bowmodifiers;
import gannuNLP.corpus.ContainsLemmaFilter;
import gannuNLP.corpus.Corpus;
import gannuNLP.corpus.WSM;
import gannuNLP.data.Input;
import gannuNLP.data.Lemma;
import gannuNLP.data.Sense;
import gannuNLP.keywordextraction.KeywordExtractor;
import gannuNLP.keywordextraction.KeywordsByTFIDF;
import java.util.ArrayList;
/**
 * Bag-of-words modifier that appends keywords extracted from corpus documents
 * to the bag of words of every sense of a target lemma.
 * The corpus to load is named by the "corpus" parameter of this modifier.
 * @author Francisco Viveros-Jiménez.
 */
public class AddCorpusRelatedLemmas extends BoWModifier {
    private static final long serialVersionUID = 1L;

    /**
     * The base corpus. The field is static, so it is shared by all instances
     * and is replaced every time {@link #init()} runs.
     */
    static Corpus corpus;

    /**
     * Extractor used to obtain the keywords of each document; assigned in {@link #init()}.
     */
    KeywordExtractor kw;

    /**
     * Creates the modifier under the name "AddCorpusRelatedLemmas".
     */
    public AddCorpusRelatedLemmas() {
        super("AddCorpusRelatedLemmas");
    }

    /**
     * Loads the corpus named by the "corpus" parameter and creates the
     * TF-IDF keyword extractor.
     * @throws Exception Propagated from the corpus construction.
     */
    @Override
    public void init() throws Exception {
        String corpusName = this.getValue("corpus");
        // NOTE(review): the meaning of the trailing "true" flag is defined by Corpus - confirm there.
        corpus = new Corpus(corpusName, this.dict, true);
        this.kw = new KeywordsByTFIDF();
    }

    /**
     * Adds the keywords of the corpus documents kept by a {@link ContainsLemmaFilter}
     * for the target lemma to the bag of words of each of its senses.
     * A new Corpus is built from the shared one and that new object is the one
     * handed to the filter.
     * @param lemma The target lemma.
     * @throws Exception Propagated from the filtering or the keyword extraction.
     */
    @Override
    public void modifyBow(Lemma lemma) throws Exception {
        ContainsLemmaFilter lemmaFilter = new ContainsLemmaFilter("");
        Corpus related = new Corpus(AddCorpusRelatedLemmas.corpus);
        lemmaFilter.filter(related, lemma);

        ArrayList<WSM> keywords = this.gatherKeywords(related);

        // Every sense receives the full keyword list, in extraction order.
        for (Sense sense : lemma.getSenses()) {
            for (WSM keyword : keywords) {
                sense.getBagOfWords().add(keyword.getDimension());
            }
        }
    }

    /**
     * Collects the keywords of every document of the given corpus into one list.
     * Keywords are appended document by document; nothing is de-duplicated here.
     * @param source The corpus whose documents are processed.
     * @return The keywords of all the documents, in document order.
     * @throws Exception Propagated from the keyword extractor.
     */
    private ArrayList<WSM> gatherKeywords(Corpus source) throws Exception {
        ArrayList<WSM> gathered = new ArrayList<WSM>();
        for (Input document : source.getDocuments()) {
            // NOTE(review): 10 is presumably the keyword limit per document - confirm in KeywordExtractor.
            gathered.addAll(this.kw.extractKeywords(document, 10, true));
        }
        return gathered;
    }
}