package gannuNLP.dictionaries;
import gannuNLP.data.Count;
import gannuNLP.data.Lemma;
import gannuNLP.data.Sense;
import gannuNLP.data.SuperLemma;
import gannuUtil.Util;
import gannuWSD.DataLoader;
import gannuWSD.bowmodifiers.BoWModifier;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
/**
* DataBroker exists for accessing all the available dictionaries through a single object.
* DataBroker has the following features: <br/>
* (1) It maintains a cache of SuperLemma objects for avoiding loading entire off-line dictionaries
* and accessing the online ones.<br/>
* (2) It maintains a cache of lemma objects for avoiding recalculation of the BoWs
* made by BoWModifier objects.<br/>
* (3) It allows you to load samples from previously processed sample sources into the BoW.
* SuperLemma files allow faster access to the dictionary/corpus data for each lemma.<br/>
* (4) It saves a good amount of memory usage at the cost of some hard drive storage.
* Therefore, DataBroker is the main way for accessing data from dictionaries and sample sources.
*
* @author Francisco Viveros-Jiménez
*
*/
public class DataBroker extends Dictionary{
/**
* Method for setting the base URL of web dictionaries.
*/
public void setBaseURL(String URL)
{
if(this.isWeb)
{
this.source.setPath(URL);
}
}
private static final long serialVersionUID = 1L;
/**
* Active BoWModifiers being used in this session.
*/
ArrayList<BoWModifier> modifiers;
/**
* Base dictionary.
*/
Dictionary source;
/**
* List of lemmas which IDF counts are already loaded in the cache.
*/
ArrayList<String> lemmas;
/**
* Cache of counts for calculating IDF.
*/
ArrayList<Double> counts;
/**
* List of loaded sample sources.
*/
ArrayList<String> sampleSources;
/**
* Tells is the dictionary is ready to go.
*/
boolean loaded;
/**
*
* @return The base dictionary.
*/
public Dictionary getSource() {
return source;
}
/**
* Creates a DataBroker for loading senses from an specified source dictionary.
* @param Source Full class name of the source dictionary.
*/
public DataBroker(String Source)throws Exception{
super();
this.modifiers=new ArrayList<BoWModifier>();
this.loaded=false;
this.source=(Dictionary)Class.forName(Source).newInstance();
this.isWeb=this.source.isWeb();
this.path=".";
this.name=this.source.getName();
this.glossCount=0;
this.lemmas=new ArrayList<String>();
this.counts=new ArrayList<Double>();
this.usesPOSTag=this.source.usesPOSTag;
}
public DataBroker(String Source, String version) throws Exception{
this.modifiers=new ArrayList<BoWModifier>();
this.loaded=false;
this.source=(Dictionary)Class.forName(Source).newInstance();
this.source.setVersion(version);
this.language=this.source.getLanguage();
this.isWeb=this.source.isWeb();
this.path=".";
this.name=this.source.getName();
this.glossCount=0;
this.lemmas=new ArrayList<String>();
this.counts=new ArrayList<Double>();
this.usesPOSTag=this.source.usesPOSTag;
}
@Override
/**
* Loads the glossCount and wordCount data from the .sta files.
* You should create the SuperLemmas first by executing targetDictionaryClass.WriteSuperLemmas() method.
*/
public void loadCoreData() throws Exception {
//Loading Statistics from dictionary
try
{
FileReader f=new FileReader("./data/"+getName()+"/"+getName()+".sta");
BufferedReader in=new BufferedReader(f);
this.glossCount=Double.parseDouble(in.readLine());
this.wordCount=Double.parseDouble(in.readLine());
in.close();
f.close();
}
catch(Exception e)
{
// Dictionary not parsed, initialize it.
if(!this.source.isWeb())
throw new Exception("Error: Dictionary not loaded, use the java -jar dictLoader.jar connectorClass dictFilesPath GANNUpath command for parsing the dictionary first!");
}
if(this.source.isWeb())
{
this.source.setVersion(this.language);
this.source.setPath(this.path);
this.source.loadCoreData();
File aux=new File("./data/"+getName()+"/"+getName()+".sta");
if(!aux.exists())
{
File f=new File("./data/"+getName()+"/");
f.mkdirs();
FileWriter fout=new FileWriter("./data/"+getName()+"/"+getName()+".sta");
BufferedWriter out=new BufferedWriter(fout);
this.glossCount=this.source.getGlossCount();
this.wordCount=this.source.getWordCount();
out.write(String.valueOf(this.glossCount)+"\n");
out.write(String.valueOf(this.wordCount)+"\n");
out.close();
fout.close();
}
aux=new File("./data/"+getName()+".sl");
if(!aux.exists())
{
DataLoader.addSourceList(aux, getName());
}
}
}
@Override
/**
* Calls this.loadCoreData() and sets the valid sample sources.
* @param sampleSources Valid sample sources.
* Use Glosses for loading dictionary definitions and samples.
* Use the corpus name for loading corpus samples.
* The String should be written in the format "source1(,sourceN)*".
*/
public void load(String sampleSources) throws Exception {
loadCoreData();
// Set the corpus valid sources
this.sampleSources=new ArrayList<String>(5);
for(String source:sampleSources.split(","))
{
if(source.equals("Glosses"))
this.sampleSources.add(this.name);
else
this.sampleSources.add(source);
}
}
@Override
/**
* Creates base files for loading a dictionary from zero.
* It is recommended to use gannu.tools.dictLoader class instead.
*/
public void parseSamplesFromDictionary() throws Exception {
this.loadSource();
this.source.parseSamplesFromDictionary();
}
/**
* Loads a source dictionary if needed for doing operations other than queries
* like lemmatization.
* @throws Exception
*/
private void loadSource() throws Exception
{
if(!this.loaded)
{
this.loaded=true;
this.source.loadCoreData();
}
}
/**
* Retrieves the corresponding senses of a lemma extracted directly from the
* base dictionary. This method is used for skipping the cache usage (not recommended).
* @param lemma The target lemma.
* @return An ArrayList containing the corresponding senses for the target Lemma.
*/
public ArrayList<Sense> getSenses(String lemma) throws Exception
{
if(this.isWeb)
{
return this.getLemma(lemma).getSenses();
}
else
{
ArrayList<String> targetWords=new ArrayList<String>();
ArrayList<String> posTags=new ArrayList<String>();
if(this.usesPOSTag)
{
if(lemma.matches(".*_\\w"))
{
targetWords.add(lemma);
posTags.add(lemma.substring(lemma.length()-2));
}
else
{
targetWords.add(lemma+"_N");
targetWords.add(lemma+"_V");
targetWords.add(lemma+"_J");
targetWords.add(lemma+"_R");
posTags.add("_N");
posTags.add("_V");
posTags.add("_J");
posTags.add("_R");
}
}
else
{
posTags.add("");
targetWords.add(lemma);
}
ArrayList<String> lemmatas=new ArrayList<String>();
ArrayList<Sense> senses=new ArrayList<Sense>();
for(String plemma:targetWords)
{
lemmatas.addAll(this.Morphy(plemma));
}
for(String plemma:Util.removeDuplicates(lemmatas))
{
for(String posTag:posTags)
{
Lemma l=this.getLemma(plemma+posTag);
if(l!=null)
{
for(Sense s:l.getSenses())
{
if(!senses.contains(s))
{
senses.add(s);
}
}
}
}
}
return senses;
}
}
/**
* Adds a BoWModifier for this session.
* @param mod The BoWModifier being added,
*/
public void addModifier(BoWModifier mod)
{
this.modifiers.add(mod);
}
/**
* Return the IDF value of a lemma.
* This method creates a cache by using the this.lemmas
* and this.counts lists for speeding up the process.
* @param lemma The target lemma.
* @return The IDF value. The minimum frequency of a lemma is 1 for avoiding calculation errors.
*/
public double getIDF(String lemma)throws Exception
{
File tmp=new File("./data/idfs/"+this.getCompleteName().replace(">", "@@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".idf");
int index=this.lemmas.indexOf(lemma);
double count=1.0;
if(index<0)
{
if(tmp.exists())
{
count=((Double)Util.loadObject(tmp)).doubleValue();
return Math.log((this.glossCount)/count);
}
else
{
File dir=new File("./data/idfs/"+this.getCompleteName().replace(">", "@@@@@@")+"/");
if(!dir.exists())
{
dir.mkdirs();
}
}
File ft=new File(this.path+"/data/"+this.getName()+"/"+Dictionary.normalizeLemmaforFile(lemma)+".slm");
if(ft.exists())
{
Lemma l=this.getLemmaNoModifiers(lemma);
if(l!=null)
for(Count c:l.getCounts())
count+=c.getFrequency();
}
this.lemmas.add(lemma);
this.counts.add(new Double(count));
}
else
{
count=this.counts.get(index).doubleValue();
}
try
{
Util.writeObject(tmp, new Double(count));
}
catch(Exception e)
{
System.out.print("!");
}
return Math.log((this.glossCount)/count);
}
/**
* Returns a Lemma without using the BoWModifiers active for this session.
* @param lemma The target lemma.
* @return The corresponding Lemma object.
* @throws Exception
*/
public Lemma getLemmaNoModifiers(String lemma)throws Exception
{
File d=new File("./data/lemmas/"+this.getName()+"/");
d.mkdirs();
SuperLemma s=this.loadSuperLemma(lemma,"./data/"+this.getName()+"/");
Lemma l=null;
if(s.getLemmas().size()>0)
{
l=s.retrieveLemma(this.source.toString());
if(l!=null)
{
l=new Lemma(l,this.sampleSources);
}
}
if(l==null&&this.source.isWeb())
{
l=this.source.getLemma(lemma);
if(l!=null)
{
s.addLemma(l);
this.WriteSuperLemma("./data/"+this.getName()+"/",s);
}
}
return l;
}
/**
* @return Returns the following String: "this.name+"_"+this.sampleSources.toString()+"_"+this.modifiers.toString()".
*/
public String getCompleteName()
{
String text="_[";
for(String t:this.sampleSources)
{
if(t.length()>4)
{
text+=t.substring(0,4)+";";
}
else
{
text+=t+";";
}
}
text+="][";
for(BoWModifier bow:this.modifiers)
{
text+=bow.getName().substring(0,4);
if(bow.getParams().size()>0)
text+="#"+bow.getSimplifiedParameterString().replace("/","_");
text+=";";
}
text+="]";
return this.name+text;
}
@Override
/**
* Returns a Lemma from the dictionary after processing it with the active BoWModifiers.
* This method uses the cache often and is the one that handles the /data/lemmas/ folder.
* @param lemma The target lemma.
* @return Lemma The corresponding Lemma object.
*/
public Lemma getLemma(String lemma)throws Exception
{
//Search in cache
File f=new File("./data/lemmas/"+this.getCompleteName().replace(">", "@@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".glm");
if(f.exists())
{
return (Lemma) Util.loadObject(f);
}
else
{
File d=new File("./data/lemmas/"+this.getCompleteName().replace(">", "@@@@@@")+"/");
d.mkdirs();
d=new File("./data/lemmas/"+this.getName()+"/");
d.mkdirs();
}
SuperLemma s=this.loadSuperLemma(lemma,"./data/"+this.getName()+"/");
Lemma l=null;
if(s.getLemmas().size()>0)
{
l=s.retrieveLemma(this.source.toString());
if(l!=null)
{
l=new Lemma(l,this.sampleSources);
}
}
if(l==null&&this.source.isWeb())
{
l=this.source.getLemma(lemma);
if(l!=null)
{
s.addLemma(l);
this.WriteSuperLemma("./data/"+this.getName()+"/",s);
}
}
if(l!=null)
{
for(BoWModifier mod:this.modifiers)
{
mod.modifyBow(l);
}
}
if(l!=null)
{
l.trim();
Util.writeObject(f, l);
}
return l;
}
/**
* @return A String containing the following: "super.toString()+":"+this.modifiers.toString()".
*/
public String toString()
{
return super.toString()+":"+this.modifiers.toString();
}
/**
* Returns the corresponding sense of a target SenseId.
* @param sid The target SenseId.
* @return The corresponding Sense object.
*/
public Sense getSense(String sid)throws Exception
{
if(!this.isWeb)
{
String lemma=sid.split("@")[0];
Lemma l=this.getLemmaNoModifiers(lemma);
if(sid.split("@").length>1)
{
int senseNumber=Integer.parseInt(sid.split("@")[1]);
return l.getSenses().get(senseNumber);
}
else
return null;
}
else
{
return this.source.getSense(sid);
}
}
@Override
public void setVersion(String version) {
this.language=version;
}
@Override
public boolean doesLemmaExists(String lemma) throws Exception{
if(!this.isWeb)
{
File f=new File("./data/lemmas/"+this.getCompleteName().replace(">", "@@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".glm");
return f.exists();
}
else
{
return this.getLemma(lemma)!=null;
}
}
}