package gannuNLP.corpus;
import gannuNLP.data.AmbiguousWord;
import gannuNLP.data.Count;
import gannuNLP.data.Input;
import gannuNLP.data.Lemma;
import gannuNLP.data.Sense;
import gannuNLP.data.SuperLemma;
import gannuNLP.dictionaries.Dictionary;
import gannuUtil.Util;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
/**
* Class for loading a corpus of SGF files for calculating some useful
* statistics and extending bag of words with new samples.
* @author Francisco Viveros-Jiménez
*
*/
public class Corpus{
	/**
	 * Documents loaded from the corpus.
	 */
	ArrayList<Input> documents;
	/**
	 * Source dictionary used for tagging this corpus.
	 */
	Dictionary dict;
	/**
	 * The name of this corpus.
	 */
	String name;
	/**
	 *
	 * @return The documents loaded from the corpus.
	 */
	public ArrayList<Input> getDocuments() {
		return documents;
	}
	/**
	 * Loads a corpus of sgf files from a target folder.
	 * The parsed corpus is cached as "data/corpora/&lt;name&gt;_&lt;noTag&gt;.gcf";
	 * later constructions reload that cache instead of re-parsing the SGF files.
	 * @param source The target folder containing the target SemCor files.
	 * @param dict The base dictionary.
	 * @param noTag True if you want to include undisambiguated words.
	 * @throws Exception
	 */
	public Corpus(String source, Dictionary dict,boolean noTag) throws Exception
	{
		this.dict=dict;
		File s=new File(source);
		this.name=s.getName();
		File d=new File("data/corpora/");
		d.mkdirs();
		File p=new File("./data/corpora/"+this.name+"_"+String.valueOf(noTag)+".gcf");
		if(p.exists())
		{
			System.out.println("Loading "+p.getName()+"...");
			// Narrow the unchecked-cast suppression to the single cast instead
			// of annotating the whole constructor.
			@SuppressWarnings("unchecked")
			ArrayList<Input> cached=(ArrayList<Input>)Util.loadObject(p);
			this.documents=cached;
			// The dictionary reference is not part of the serialized cache;
			// reattach it to every word after loading.
			for(Input doc:this.documents)
			{
				for(AmbiguousWord word:doc.getAmbiguousWords())
				{
					word.setDict(dict);
				}
			}
		}
		else
		{
			this.documents=new ArrayList<Input>();
			int i=1;
			ArrayList<File> files=Util.getAllSGFFiles(s);
			for(File f:files)
			{
				// Progress indicator: "current/total".
				System.out.println(String.valueOf(i)+"/"+String.valueOf(files.size()));
				this.documents.add(new Input(f,null,dict,noTag,false));
				i++;
			}
			this.documents.trimToSize();
			System.out.println("Saving "+p.getName()+"...");
			Util.writeObject(p, this.getDocuments());
		}
	}
	/**
	 * Returns the sense number having the highest corpus frequency.
	 * @param lemma The target lemma.
	 * @return The number of the most frequent sense in the corpus, or -1 when
	 *         the lemma never appears with a valid sense tag (or the
	 *         dictionary is web-based, which is not supported yet).
	 * @throws Exception
	 */
	public int getMFS(String lemma) throws Exception {
		// Start with room for 100 senses and grow on demand; the previous
		// fixed-size table overflowed on any sense number >= 100.
		int[] counts=new int[100];
		if(!this.dict.isWeb())
		{
			for(Input doc:this.documents)
			{
				for(AmbiguousWord word:doc.getAmbiguousWords())
				{
					if(word.getLemma().equals(lemma))
					{
						for(int answer: word.getCorrectSenseNumbers())
						{
							// Negative numbers mark undisambiguated words.
							if(answer>=0)
							{
								if(answer>=counts.length)
								{
									int[] bigger=new int[answer+1];
									System.arraycopy(counts, 0, bigger, 0, counts.length);
									counts=bigger;
								}
								counts[answer]++;
							}
						}
					}
				}
			}
		}
		else//Search in wiki!!
		{
			// TODO: web dictionaries are not supported yet; counts stay empty
			// so the method falls through to the -1 return below.
		}
		// Pick the highest count; the lowest sense number wins ties.
		int sel=0;
		for(int i=1;i<counts.length;i++)
		{
			if(counts[i]>counts[sel])
			{
				sel=i;
			}
		}
		if(counts[sel]>0)
			return sel;
		else
			return -1;
	}
	/**
	 * Add corpus samples into the document's AmbiguousWords.
	 * Each sentence becomes one bag-of-words sample that is attached to every
	 * correct sense appearing in that sentence.
	 * @param document Target document.
	 * @throws Exception
	 */
	public void transformBoW(Input document) throws Exception
	{
		for(ArrayList<AmbiguousWord> sentence:document.getSentences())
		{
			ArrayList<Sense> currentSenses=new ArrayList<Sense>();
			ArrayList<String> sample=new ArrayList<String>(sentence.size());
			// Build the plain-text sample once per sentence; StringBuilder
			// avoids quadratic String concatenation in the loop.
			StringBuilder sampleText=new StringBuilder();
			for(AmbiguousWord word:sentence)
			{
				sampleText.append(word.getLemma()).append(" ");
				sample.add(word.getLemma());
				for(Sense answer: word.getCorrectSenses())
				{
					if(!currentSenses.contains(answer))
					{
						currentSenses.add(answer);
					}
				}
			}
			for(Sense sense:currentSenses)
			{
				sense.addBagOfWords(sampleText.toString(), sample,this.name);
			}
		}
	}
	/**
	 * Tells if the given lemma accomplishes the one-sense-per-discourse rule
	 * in this tagged corpus, i.e. no document tags it with more than one sense.
	 * @param lemma The given lemma.
	 * @return True if the given lemma accomplishes the rule in this corpus.
	 * @throws Exception
	 */
	public boolean isOneSensePerDiscourse(String lemma) throws Exception
	{
		for(Input doc:this.documents)
		{
			// Distinct sense numbers used for this lemma inside one document.
			ArrayList<Integer> senses=new ArrayList<Integer>();
			for(AmbiguousWord word:doc.getAmbiguousWords())
			{
				if(lemma.equals(word.getLemma()))
				{
					for(int answer:word.getCorrectSenseNumbers())
					{
						// valueOf reuses cached boxes; the Integer(int)
						// constructor is deprecated.
						Integer ans=Integer.valueOf(answer);
						if(!senses.contains(ans))
							senses.add(ans);
					}
				}
			}
			if(senses.size()>1)
				return false;
		}
		return true;
	}
	/**
	 * Generates a copy of another corpus object (shallow copy: the documents
	 * themselves are shared with the original).
	 * @param original The original corpus.
	 */
	public Corpus(Corpus original)
	{
		this.dict=original.getDict();
		this.documents=new ArrayList<Input>(original.getDocuments().size());
		this.documents.addAll(original.getDocuments());
	}
	/**
	 * Creates an empty corpus with only a dictionary and a name.
	 * No documents are loaded; useful as a lightweight handle.
	 * @param dict The base dictionary.
	 * @param name The name of this corpus.
	 */
	public Corpus(Dictionary dict,String name) {
		this.dict=dict;
		this.name=name;
	}
	/**
	 * Returns the corpus name.
	 * @return This corpus name.
	 */
	public String getName()
	{
		return this.name;
	}
	/**
	 *
	 * @return The dictionary used for tagging this corpus.
	 */
	public Dictionary getDict() {
		return dict;
	}
	/**
	 * Method for writing SuperLemmas for using it with the DataBroker.
	 * Use this method for extending dictionary definitions with examples.
	 * The name of this corpus will be used as future reference for loading the samples with the DataBroker.
	 * @param path The path where the DataBroker files are being stored.
	 * @throws Exception
	 */
	public void WriteSuperLemmas(String path) throws Exception
	{
		// counts/lemmas are parallel lists: counts.get(i) is the corpus
		// frequency of lemmas.get(i).
		ArrayList<Count> counts=new ArrayList<Count>();
		ArrayList<String> lemmas=new ArrayList<String>();
		System.out.println("Processing samples!!!");
		int d=1;
		ArrayList<Update> updates=new ArrayList<Update>();
		for(Input document:this.documents)
		{
			System.out.println(String.valueOf(d)+"/"+String.valueOf(this.documents.size()));
			d++;
			int i=0;
			for(ArrayList<AmbiguousWord> sentence:document.getSentences())
			{
				// First pass: tally lemma frequencies and build the sentence's
				// bag of words.
				ArrayList<String> bow=new ArrayList<String>(sentence.size());
				for(AmbiguousWord word:sentence)
				{
					int index=lemmas.indexOf(word.getLemma());
					if(index>=0)
					{
						counts.get(index).increaseFrequency();
					}
					else
					{
						lemmas.add(word.getLemma());
						counts.add(new Count(1.0,this.name));
					}
					bow.add(word.getLemma());
				}
				// Second pass: queue one Update per tagged sense occurrence.
				for(AmbiguousWord word:sentence)
				{
					for(int index:word.getCorrectSenseNumbers())
					{
						if(index>=0)
						{
							updates.add(new Update(word.getLemma(),index,document.getTextSentences().get(i), bow));
						}
					}
				}
				i++;
			}
		}
		// Sorting groups all updates of the same lemma together so each
		// SuperLemma is loaded and written exactly once.
		Collections.sort(updates);
		System.out.println("Saving samples!!!");
		d=1;
		Update p=null;
		SuperLemma s=null;
		Lemma l=null;
		for(Update u:updates)
		{
			System.out.println(String.valueOf(d)+"/"+String.valueOf(updates.size()));
			d++;
			if(p==null||!u.getLemma().equals(p.getLemma()))
			{
				// Lemma changed: flush the previous SuperLemma before
				// loading the next one.
				if(s!=null)
				{
					l.addCount(counts.get(lemmas.indexOf(p.getLemma())));
					this.dict.WriteSuperLemma(path, s);
				}
				s=this.dict.loadSuperLemma(u.getLemma(),path);
				l=s.retrieveLemma(this.dict.getName());
			}
			Sense sens=l.getSenses().get(u.getSense());
			// Skip duplicate sample texts for the same sense.
			if(!sens.getSamples().contains(u.getText()))
			{
				sens.addBagOfWords(u.getText(), u.getBow(),this.name);
			}
			p=u;
		}
		// Flush the last pending SuperLemma (the loop only writes on a
		// lemma change).
		if(s!=null)
		{
			l.addCount(counts.get(lemmas.indexOf(p.getLemma())));
			this.dict.WriteSuperLemma(path, s);
		}
	}
}