package gannuNLP.corpus;
import gannuNLP.data.AmbiguousWord;
import gannuNLP.data.Input;
import gannuNLP.data.Lemma;
import gannuNLP.dictionaries.Dictionary;
import gannuNLP.dictionaries.Wiki;
import gannuUtil.Util;
import java.io.File;
import java.util.ArrayList;
/**
* Class for extracting statistics from Wikipedia.
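* <p>Typical usage (a sketch only; it assumes an already-configured {@link Wiki}
* connector and previously parsed target documents):</p>
* <pre>{@code
* Wiki wiki = ...;              // hypothetical: an initialized Wiki dictionary connector
* ArrayList<Input> docs = ...;  // hypothetical: the target documents
* WikiCorpus corpus = new WikiCorpus(docs, "wikiStats", wiki);
* int mfs = corpus.getMFS("bank");                          // index of the most frequent sense
* boolean oneSense = corpus.isOneSensePerDiscourse("bank"); // one-sense-per-discourse check
* }</pre>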
* @author Francisco Viveros-Jiménez
*
*/
public class WikiCorpus extends Corpus{
/**
* Instantiates a WikiCorpus for extracting statistics for all the lemmas of the target documents.
* @param targets List containing the target documents.
* @param name The name of this corpus.
* @param dict Base dictionary. It must be an instance of a Wiki connector.
* @throws Exception
*/
public WikiCorpus(ArrayList<Input> targets, String name, Dictionary dict) throws Exception
{
super(dict,name);
this.dict=dict;
File tmp=new File("./data/matrices/"+dict.getCompleteName().replace(">", "@@@@@")+"/");
tmp.mkdirs();
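// Per-lemma co-occurrence matrices are cached as .gmx files under ./data/matrices/<dictionary>/.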
Wiki wiki=(Wiki)this.dict;
for(Input doc:targets)
{
System.out.println("Calculating matrices for "+doc.toString());
for(AmbiguousWord word:doc.getAmbiguousWords())
{
if(word.getCurrentLemmaObject()!=null&&word.getSenses().size()>0&&word.getSenses().size()<150)
{
System.out.print(".");
File fout=new File("./data/matrices/"+dict.getCompleteName().replace(">", "@@@@@")+"/"+Dictionary.normalizeLemmaforFile(word.getLemma())+".gmx");
if(!fout.exists())
{
SmallPair p=new SmallPair(word.getLemmaObject());
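// First pass: store each sense's own Wikipedia count on the diagonal.
// A sense with fewer than 2*|senses| occurrences is treated as rare: its whole
// row and column are filled with that count (or 1 when the count is zero).
// Cells between two frequent senses are left at zero for the second pass.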
for(int i=0;i<p.getLemma().getSenses().size();i++)
{
p.getCounts()[i][i]=(int)wiki.getCounts(p.getLemma().getSenses().get(i),p.getLemma().getSenses().get(i));
for(int j=0;j<p.getLemma().getSenses().size();j++)
{
if(p.getCounts()[i][i]<(2*p.getLemma().getSenses().size()))
{
if(p.getCounts()[i][i]==0)
{
p.getCounts()[i][j]=1;
p.getCounts()[j][i]=1;
}
else
{
p.getCounts()[i][j]=p.getCounts()[i][i];
p.getCounts()[j][i]=p.getCounts()[i][i];
}
}
else
{
if(j>i)
{
p.getCounts()[i][j]=0;
}
}
}
}
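// Second pass: for every pair of frequent senses, query their co-occurrence count.
// The search stops as soon as one pair co-occurs at least 2*|senses| times,
// since a single frequent pair already settles the one-sense-per-discourse test.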
for(int i=0;i<p.getLemma().getSenses().size();i++)
{
for(int j=i+1;j<p.getLemma().getSenses().size();j++)
{
if(p.getCounts()[i][i]>=(2*p.getLemma().getSenses().size())&&p.getCounts()[j][j]>=(2*p.getLemma().getSenses().size()))
{
p.getCounts()[i][j]=(int)wiki.getCounts(p.getLemma().getSenses().get(i),p.getLemma().getSenses().get(j));
p.getCounts()[j][i]=p.getCounts()[i][j];
if(p.getCounts()[j][i]>=(2*p.getLemma().getSenses().size()))
{
i=p.getLemma().getSenses().size();
break;
}
}
}
}
Util.writeObject(fout, p);
}
}
}
System.out.println();
}
}
/**
* Returns the sense number of the sense having the highest frequency in the Wikipedia Dictionary.
* @param lemma The target lemma.
* @return The sense number of the most frequent sense in the Wikipedia Dictionary.
* @throws Exception
*/
public int getMFS(String lemma) throws Exception {
File fout=new File("./data/matrices/"+dict.getCompleteName().replace(">", "@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".gmx");
// The matrix file may contain either a Pair or a SmallPair; normalize to SmallPair.
Object o=Util.loadObject(fout);
SmallPair p;
if(o instanceof Pair)
p=new SmallPair((Pair)o);
else
p=(SmallPair)o;
int sel=0;
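// Diagonal entries hold each sense's own occurrence count; pick the index of the largest one.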
for(int i=1;i<p.getLemma().getSenses().size();i++)
{
if(p.getCounts()[sel][sel]<p.getCounts()[i][i])
{
sel=i;
}
}
return sel;
}
/**
* If you want to use Wikipedia as a corpus you should look into Wiki.createInputFromArticle.
* This method does nothing. It exists only to prevent an accidental call.
* @param document Target document.
* @throws Exception
*/
public void transformBoW(Input document) throws Exception
{
}
/**
* Tells whether the given lemma follows the one sense per discourse rule
* in the Wikipedia Dictionary.
* @param lemma The target lemma.
* @return True if the given lemma follows the rule in this corpus.
* @throws Exception
*/
public boolean isOneSensePerDiscourse(String lemma) throws Exception
{
File fout=new File("./data/matrices/"+dict.getCompleteName().replace(">", "@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".gmx");
if(fout.exists())
{
Object o=Util.loadObject(fout);
SmallPair p;
if(o instanceof Pair)
{
p=new SmallPair((Pair)o);
}
else
p=(SmallPair) o;
Lemma l=p.getLemma();
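// The rule is violated when any pair of senses co-occurs at least 2*|senses| times in Wikipedia.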
for(int i=0;i<l.getSenses().size();i++)
{
for(int j=i+1;j<l.getSenses().size();j++)
{
if(p.getCounts()[j][i]>=(2*p.getLemma().getSenses().size()))
return false;
}
}
}
return true;
}
/**
* Returns the corpus name.
* @return This corpus name.
*/
public String getName()
{
return this.name;
}
/**
*
* @return The dictionary used for tagging this corpus.
*/
public Dictionary getDict() {
return dict;
}
/**
*
* If you want to use Wikipedia as a corpus you should look into Wiki.createInputFromArticle.
* This method does nothing. It exists only to prevent an accidental call.
* @param path Path for saving the SuperLemmas.
* @throws Exception
*/
public void WriteSuperLemmas(String path) throws Exception
{
}
}