package gannuNLP.dictionaries;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;
import gannuNLP.data.Lemma;
import gannuNLP.data.Relation;
import gannuNLP.data.Sense;
import gannuUtil.KeyArray;
import gannuUtil.KeyString;
/**
* A simple connector to WordNet. Before using it you must set the path with WordNet.setPath(String). WordNet.loadDataBase(String) is the first method that should be executed.
* This connector is a little slow to load because of the counts needed for IDF calculation.
* The valid versions of WordNet are 3.0, 2.1 and 1.7. Others can be added by using the dictLoader command.
* @author Francisco Viveros-Jiménez
*
*/
public class WordNet extends Dictionary{
/**
 * Serialization version for this Dictionary subclass.
 */
private static final long serialVersionUID = 1L;
/**
 * Creates a WordNet connector with its default configuration: a local
 * (non-web) English dictionary rooted at "Resources/WordNet 3.0/".
 * Use setPath(String) to point at another installation and
 * setVersion(String) to complete the dictionary name (e.g. "WordNet 3.0").
 */
public WordNet(){
super();
// WordNet entries carry POS tags and are read from local database files.
this.usesPOSTag=true;
this.isWeb=false;
// Default location; callers may override via setPath(String).
this.path="Resources/WordNet 3.0/";
// Trailing space is intentional: setVersion(String) appends the version number.
this.name="WordNet ";
this.language="en";
}
/**
 * Loads WordNet's core data: the irregular-form table, the preposition list,
 * the lemma-to-synset maps from the index.* files and every synset (with its
 * gloss) from the data.* files. Also tracks the longest collocation seen so
 * multi-word lookups know how many tokens to join.
 * @throws Exception if any of the database files cannot be read.
 */
@Override
public void loadCoreData() throws Exception {
    // Memory initialization; capacities match WordNet 3.0's entry counts.
    senseMaps = new ArrayList<KeyArray>(155290);
    senses = new ArrayList<Sense>(117660);
    this.loadIrregulars();
    glossCount = 0.0;
    // The four POS database suffixes, in the order expected by getPOS(i).
    String[] posFiles = {"noun", "verb", "adj", "adv"};
    // Load the preposition list (one word per line) and sort it for binary search.
    prepositions = new ArrayList<String>(70);
    BufferedReader in = new BufferedReader(new FileReader("data/prepositions"));
    try {
        String line = in.readLine();
        while (line != null) {
            prepositions.add(line);
            line = in.readLine();
        }
    } finally {
        // try/finally so the reader cannot leak if readLine throws.
        in.close();
    }
    Collections.sort(prepositions);
    // Load the lemma -> synset maps from the index.* files.
    maxCollocationSize = 0;
    for (int i = 0; i < posFiles.length; i++) {
        String pos = "_" + getPOS(i);
        in = new BufferedReader(new FileReader(path + "/index." + posFiles[i]));
        try {
            String line = in.readLine();
            while (line != null) {
                // License-header lines start with a space; data lines do not.
                if (!line.startsWith(" ")) {
                    String[] tokens = line.split(" ");
                    // Collocations are stored as "new_york"; count their words.
                    int words = tokens[0].split("_").length;
                    if (words > maxCollocationSize)
                        maxCollocationSize = words;
                    senseMaps.add(new KeyArray(tokens[0] + pos, parseSenseMap(tokens, pos)));
                }
                line = in.readLine();
            }
        } finally {
            in.close();
        }
    }
    Collections.sort(senseMaps);
    // Load every synset from the data.* files.
    System.out.println("Loading dictionary (wait some minutes please!)");
    for (int i = 0; i < posFiles.length; i++) {
        in = new BufferedReader(new FileReader(path + "/data." + posFiles[i]));
        try {
            String line = in.readLine();
            while (line != null) {
                if (!line.startsWith(" ")) {
                    senses.add(parseGloss(line, i));
                }
                line = in.readLine();
            }
        } finally {
            in.close();
        }
    }
    glossCount += (double) senses.size();
    Collections.sort(senses);
}
/**
 * Loads the core data plus the pre-parsed samples and word counts used for
 * IDF calculation.
 * @param sampleSources ";"-separated list of sample-source names previously
 * produced by parseSamplesFromDictionary() (e.g. "Glosses;Samples"). A source
 * whose files are missing is reported to stdout and skipped (best-effort).
 * @throws Exception if the core WordNet files cannot be read.
 */
@Override
public void load(String sampleSources) throws Exception {
    loadCoreData();
    // Load samples & counts for each requested source.
    wordCounts = new ArrayList<KeyString>();
    String[] sources = sampleSources.split(";");
    this.wordCount = 0.0;
    for (String source : sources) {
        try {
            FileReader in = new FileReader("Resources/" + this.name + "/samples/" + source);
            try {
                loadSamplesFromSource(in);
            } finally {
                in.close();
            }
            // The original code never closed this second reader; close it too.
            in = new FileReader("Resources/" + this.name + "/counts/" + source);
            try {
                loadCountsFromFile(in);
            } finally {
                in.close();
            }
        } catch (Exception e) {
            // Deliberate best-effort: a missing source just means it was never parsed.
            System.out.println("Error: Must parse " + source + " first!");
        }
    }
    System.out.println("Setting word counts for IDF calculation");
    wordCounts.trimToSize();
    Collections.sort(wordCounts);
}
/**
 * Rewrites every synset-offset based id ("offset_POS") into a lemma-based
 * sense id of the form "lemma_POS@index", where index is the position of the
 * sense within getSenses(lemma). Relation targets are rewritten first, then
 * the senses themselves are copied under their new ids, and finally the
 * lemma index (senseMaps) is translated through the old-id/new-id table.
 * NOTE(review): the j++ probe loops assume every sense has at least one
 * synonym for which getSenses returns a non-empty list; otherwise get(j)
 * would run past the synonym list -- confirm this invariant holds.
 */
public void changeSenseId() throws Exception
{
//Change all the synsetIds to senseIds
//First the relations
for(Sense sense:this.senses)
{
for(ArrayList<Relation> rels:sense.getRelations().values())
{
for(Relation rel:rels)
{
// Resolve the relation target through its old "offset_POS" id.
Sense s=this.getSense(rel.getSid()+"_"+rel.getPos());
ArrayList<Sense> ss=new ArrayList<Sense>();
int j=-1;
// Probe the target's synonyms until one of them has senses loaded.
while(ss.size()==0)
{
j++;
ss=this.getSenses(s.getSynonyms().get(j));
}
// i = position of the target sense inside that synonym's sense list.
int i;
for(i=0;i<ss.size();i++)
{
if(ss.get(i).equals(s))
break;
}
rel.setSid(s.getSynonyms().get(j)+"@"+String.valueOf(i));
}
}
}
ArrayList<String> c=new ArrayList<String>();
ArrayList<Sense> newlist=new ArrayList<Sense>(this.senses.size());
ArrayList<KeyString> equivalents=new ArrayList<KeyString>(this.senses.size());
c.add("all");
// Rebuild every sense under its new "lemma@index" id, recording the
// old-id -> new-id pairs in equivalents for the senseMaps translation below.
for(Sense sense:this.senses)
{
ArrayList<Sense> ss=new ArrayList<Sense>();
int j=-1;
while(ss.size()==0)
{
j++;
ss=this.getSenses(sense.getSynonyms().get(j));
}
int i;
for(i=0;i<ss.size();i++)
{
if(ss.get(i).equals(sense))
break;
}
// Copy restricted to source "all" -- presumably the sample-source filter;
// verify against Sense(Sense, ArrayList) if behavior looks off.
Sense s=new Sense(sense,c);
s.setSid(sense.getSynonyms().get(j)+"@"+String.valueOf(i));
newlist.add(s);
equivalents.add(new KeyString(sense.getSid(),s.getSid()));
}
Collections.sort(newlist);
Collections.sort(equivalents);
this.senses=null;
this.senses=newlist;
// Translate the ids stored in the lemma -> senses index to the new scheme.
for(KeyArray k:this.senseMaps)
{
for(int j=0;j<k.getArray().size();j++)
{
int index=Collections.binarySearch(equivalents, new KeyString(k.getArray().get(j)));
k.getArray().set(j, equivalents.get(index).getString());
}
}
}
/**
 * Parses the data.* files and writes four derived files: the lemmatized
 * glosses and usage samples ("sid|text|lemma lemma ...") plus the lemma
 * frequency counts ("lemma|count") for each of them, used later for IDF.
 * @throws Exception if a database file cannot be read or an output file
 * cannot be written.
 */
@Override
public void parseSamplesFromDictionary() throws Exception {
    // Insertion-ordered maps so the counts files list lemmas in first-seen
    // order, matching the output of the previous list-based implementation
    // while replacing its O(n^2) indexOf lookups.
    Map<String, Integer> gcounts = new LinkedHashMap<String, Integer>();
    Map<String, Integer> scounts = new LinkedHashMap<String, Integer>();
    String[] posFiles = {"noun", "verb", "adj", "adv"};
    System.out.println("Loading glosses and creating parsed files");
    new File("./Resources/" + this.name + "/samples/").mkdirs();
    new File("./Resources/" + this.name + "/counts/").mkdirs();
    BufferedWriter outg = new BufferedWriter(new FileWriter("Resources/" + this.name + "/samples/Glosses"));
    BufferedWriter outs = new BufferedWriter(new FileWriter("Resources/" + this.name + "/samples/Samples"));
    BufferedWriter outcg = new BufferedWriter(new FileWriter("Resources/" + this.name + "/counts/Glosses"));
    BufferedWriter outcs = new BufferedWriter(new FileWriter("Resources/" + this.name + "/counts/Samples"));
    try {
        int z = 0; // synsets processed, for progress reporting
        for (int i = 0; i < posFiles.length; i++) {
            BufferedReader in = new BufferedReader(new FileReader(path + "/data." + posFiles[i]));
            try {
                String pos = getPOS(i);
                for (String line = in.readLine(); line != null; line = in.readLine()) {
                    // License-header lines start with a space; skip them.
                    if (line.startsWith(" "))
                        continue;
                    z++;
                    if (z % 1000 == 0)
                        System.out.println("Parsing WordNet Synset " + z);
                    String[] tokens = line.split("\\|");
                    String sid = tokens[0].split(" ")[0] + "_" + pos;
                    // After the '|', samples[0] is the gloss; the quoted
                    // fragments that follow are usage samples.
                    String[] samples = tokens[1].split("\"");
                    String gloss = samples[0];
                    ArrayList<ArrayList<String>> parsedGloss = lemmatize(gloss);
                    // Fall back to plain POS tagging when lemmatization fails.
                    if (parsedGloss.size() == 0)
                        parsedGloss = this.POSTagging(gloss);
                    writeParsedLine(outg, sid, gloss, parsedGloss, gcounts);
                    for (int k = 1; k < samples.length; k++) {
                        // Only keep samples longer than two words.
                        if (samples[k].split(" ").length > 2) {
                            writeParsedLine(outs, sid, samples[k], lemmatize(samples[k]), scounts);
                        }
                    }
                }
            } finally {
                in.close();
            }
        }
        writeCounts(outcg, gcounts);
        writeCounts(outcs, scounts);
    } finally {
        // Closing the BufferedWriters flushes and closes the FileWriters too.
        outcg.close();
        outcs.close();
        outg.close();
        outs.close();
    }
}
/**
 * Writes one "sid|text|lemma lemma ..." line and tallies every lemma into counts.
 */
private static void writeParsedLine(BufferedWriter out, String sid, String text,
        ArrayList<ArrayList<String>> parsed, Map<String, Integer> counts) throws Exception {
    StringBuilder lineout = new StringBuilder(sid).append("|").append(text).append("|");
    for (ArrayList<String> lemmas : parsed)
        for (String lemma : lemmas) {
            lineout.append(lemma).append(" ");
            Integer c = counts.get(lemma);
            counts.put(lemma, c == null ? Integer.valueOf(1) : Integer.valueOf(c.intValue() + 1));
        }
    // Drop the trailing space (or the final '|' when no lemma was produced,
    // mirroring the original implementation's unconditional trim).
    lineout.setLength(lineout.length() - 1);
    out.write(lineout.append("\n").toString());
}
/**
 * Dumps "lemma|count" lines in first-seen order.
 */
private static void writeCounts(BufferedWriter out, Map<String, Integer> counts) throws Exception {
    for (Map.Entry<String, Integer> e : counts.entrySet()) {
        out.write(e.getKey() + "|" + e.getValue() + "\n");
    }
}
/**
 * Extracts the synset ids listed for a lemma in a WordNet index.* line.
 * Index-line token layout (after split on " "): lemma, pos, synset_cnt,
 * p_cnt, p_cnt pointer symbols, sense_cnt, tagsense_cnt, then one synset
 * offset per sense.
 * @param tokens Array containing values of line.split(" ") operation of an
 * index file line.
 * @param pos The POS suffix (e.g. "_NN") appended to every synset offset.
 * @return An ArrayList with the corresponding synsets of a lemma, in index order.
 */
private static ArrayList<String> parseSenseMap(String[] tokens, String pos) {
    // Skip lemma, pos, synset_cnt and the p_cnt pointer symbols.
    int cursor = 4 + Integer.parseInt(tokens[3]);
    int senseCount = Integer.parseInt(tokens[cursor]);
    // Skip sense_cnt and tagsense_cnt; the synset offsets follow.
    cursor += 2;
    ArrayList<String> synsets = new ArrayList<String>(senseCount);
    for (int z = 0; z < senseCount; z++, cursor++) {
        synsets.add(tokens[cursor] + pos);
    }
    return synsets;
}
/**
 * Method for extracting a synset from a line of a WordNet data.* file.
 * Data-line token layout before the '|': synset_offset, lex_filenum, ss_type,
 * w_cnt (hexadecimal), then w_cnt (word, lex_id) pairs, then p_cnt (decimal)
 * followed by p_cnt pointers of 4 tokens each
 * (pointer_symbol, synset_offset, pos, source/target).
 * @param line The line to be processed.
 * @param pos The index of the WordNet file (0=noun, 1=verb, 2=adj, 3=adv).
 * @return A synset object with its synonyms and relations attached.
 */
private Sense parseGloss(String line, int pos) {
    String[] gloss = line.split("\\|");
    String[] tokens = gloss[0].split(" ");
    String postag = "_" + getPOS(pos);
    int i = 4;
    // w_cnt is stored in hexadecimal.
    int limit = Integer.valueOf(tokens[3], 16);
    ArrayList<String> synonyms = new ArrayList<String>(limit);
    for (int z = 0; z < limit; z++) {
        // Strip syntactic markers such as "(a)" or "(p)" from adjective lemmas.
        synonyms.add(tokens[i].replaceAll(Pattern.quote("(") + "[a-zA-Z]+" + Pattern.quote(")"), "") + postag);
        i += 2; // skip the lex_id that follows each word
    }
    Sense ps = new Sense(tokens[0], getPOS(pos), synonyms);
    // Adding relations; p_cnt is decimal.
    limit = Integer.parseInt(tokens[i]);
    i++;
    for (int z = 0; z < limit; z++) {
        ps.addRelation(tokens[i], new Relation(relationName(tokens[i]), tokens[i + 1], tokens[i + 2]));
        i += 4; // each pointer spans 4 tokens
    }
    return ps;
}
/**
 * Translates a WordNet pointer symbol into a human-readable relation name.
 * @param WNType The pointer symbol (e.g. "@", "~", "!").
 * @return The relation name, or "" for an unknown symbol.
 */
private static String relationName(String WNType) {
    if (WNType.equals("!"))
        return "Antonym";
    else if (WNType.equals("@"))
        return "Hypernym";
    else if (WNType.equals("@i"))
        return "Instance Hypernym";
    else if (WNType.equals("~"))
        return "Hyponym";
    else if (WNType.equals("~i"))
        return "Instance Hyponym";
    else if (WNType.equals("#m"))
        return "Member Holonym";
    else if (WNType.equals("#s"))
        return "Substance Holonym";
    else if (WNType.equals("#p"))
        return "Part Holonym";
    else if (WNType.equals("%m"))
        return "Member Meronym";
    else if (WNType.equals("%s"))
        return "Substance meronym";
    else if (WNType.equals("%p"))
        return "Part meronym";
    else if (WNType.equals("="))
        return "Attribute";
    else if (WNType.equals("+"))
        return "Derivationally related form";
    else if (WNType.equals(";c"))
        return "Domain of synset";
    else if (WNType.equals("-c"))
        return "Member of this domain";
    else if (WNType.equals(";r"))
        return "Region of synset";
    else if (WNType.equals("-r"))
        return "Member of this Region";
    else if (WNType.equals(";u"))
        return "Usage of synset";
    else if (WNType.equals("-u"))
        return "Member of this Usage";
    else if (WNType.equals("*"))
        return "Entailment";
    else if (WNType.equals(">"))
        return "Cause";
    else if (WNType.equals("^"))
        return "Also see";
    else if (WNType.equals("$"))
        return "Verb group";
    else if (WNType.equals("&"))
        return "Similar to";
    else if (WNType.equals("<"))
        return "Participle of verb";
    else if (WNType.equals("\\"))
        return "Pertainym";
    return ""; // unknown pointer symbol, same as the original fall-through
}
/**
 * Write all the loaded data into SuperLemma files.
 * SuperLemma files act as cache files for all the dictionaries.
 * @param path Destination directory, forwarded to the base implementation.
 */
public void WriteSuperLemmas(String path)throws Exception
{
// Convert synset-offset ids to lemma-based sense ids before caching.
// NOTE(review): changeSenseId() mutates this.senses in place, so the
// in-memory dictionary is altered after this call -- confirm callers reload.
this.changeSenseId();
super.WriteSuperLemmas(path);
}
/**
 * Builds a Lemma aggregating every sense of the given word.
 * @param lemma The word to look up (already in "word" or "word_POS" form as
 * expected by getSenses).
 * @return The populated Lemma, or null when the word has no senses.
 */
@Override
public Lemma getLemma(String lemma) throws Exception {
    ArrayList<Sense> matches = this.getSenses(lemma);
    // Unknown lemma: nothing to build.
    if (matches.isEmpty())
        return null;
    // The POS of the first sense is taken as the lemma's POS.
    return new Lemma(lemma, matches.get(0).getPos(), matches, this.getCounts(lemma), this.name);
}
/**
 * Completes the dictionary name with its version string, e.g. appending
 * "3.0" to the constructor's base name "WordNet " yields "WordNet 3.0".
 * @param version The WordNet version identifier to append.
 */
@Override
public void setVersion(String version) {
    this.name = this.name + version;
}
/**
 * Tells whether the lemma has an entry in the sense maps.
 * senseMaps is sorted during loadCoreData(), so binary search is valid here.
 * @param lemma The lemma key to look for.
 * @return true when the lemma is present in the dictionary.
 */
@Override
public boolean doesLemmaExists(String lemma) throws Exception{
    int position = Collections.binarySearch(senseMaps, lemma);
    return position >= 0;
}
}