package gannuNLP.dictionaries;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import gannuNLP.data.Count;
import gannuNLP.data.Lemma;
import gannuNLP.data.Sense;
import gannuNLP.data.SuperLemma;
import gannuUtil.KeyArray;
import gannuUtil.KeyString;
import gannuUtil.Util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
* Template class for creating dictionary connectors.
* @author Francisco Viveros-Jiménez
*
*/
public abstract class Dictionary implements Serializable{
/**
 * Unique instance of the POS tagger. It should be initialized using the setTagger method.
 * Shared by every Dictionary instance (static) and created lazily.
 */
static MaxentTagger tagger;
/**
 * Loads the corresponding Stanford POS tagger for this dictionary.
 * Lazily initializes the shared static tagger; only English ("en") is
 * currently supported (Spanish is a pending TODO and leaves the tagger null).
 * NOTE(review): not thread-safe — two threads may both observe null and load the model twice.
 * @throws Exception if the tagger model file cannot be read.
 */
void setTagger() throws Exception
{
if(Dictionary.tagger==null)
{
if(this.language.equals("en"))
Dictionary.tagger=new MaxentTagger("./data/taggermodels/"+this.language+"/english-left3words-distsim.tagger");
if(this.language.equals("es"))
{
//TODO
}
}
}
/**
 * Flag for telling if the dictionary uses POSTag.
 * When true, lemmas are stored and looked up with a "_P" suffix (N, V, J/A, R).
 */
boolean usesPOSTag;
/**
 * Tells whether lemmas in this dictionary carry a POS-tag suffix.
 * @return this.usesPOSTag
 */
public boolean itUsesPOSTag()
{
return this.usesPOSTag;
}
/**
 * String for identifying the language. Please use the exact same code as Wikipedia.
 * E.g. "en" for English or "es" for Spanish.
 */
String language;
/**
 * Returns the language code of this dictionary.
 * @return this.language
 */
public String getLanguage()
{
return language;
}
/**
 * Method for configuring the version/language of the dictionary.
 * @param version The version/language of the dictionary.
 */
public abstract void setVersion(String version);
/**
 * Returns the display name of this dictionary.
 * @return This dictionary's name with modifiers.
 */
public String getCompleteName()
{
return name;
}
/**
 * Serialization version for this base class.
 */
private static final long serialVersionUID = 1L;
/**
 * Modifies the name of this dictionary.
 * @param name The new name of this dictionary.
 */
public void setName(String name)
{
this.name=name;
}
/**
 * Returns the name of this dictionary.
 * @return this.name
 */
public String getName() {
return name;
}
/**
 * Returns the path of the dictionary resources.
 * @return this.path
 */
public String getPath() {
return path;
}
/**
 * Tells if the dictionary is a Web dictionary.
 */
protected boolean isWeb;
/**
 * Dictionary name
 */
protected String name;
/**
 * The path of dictionary resources
 */
protected String path;
/**
 * Contains the frequency of each lemma on the loaded samples.
 * Kept sorted so getIDF/getCounts can use binary search.
 */
protected ArrayList<KeyString> wordCounts;
/**
 * Contains the mapping between a lemma and its possible senses. Lemmas have
 * the format lemma_P, where P is the POStag (N, V, A, R). This list its always sorted by the lemma field.
 */
protected ArrayList<KeyArray> senseMaps;
/**
 * Contains dictionary senses with lemmatized definitions. Lemmatization was done
 * using methods provided in this class. This list allows searching by a senseId.
 */
protected ArrayList<Sense> senses;
/**
 * Contains memory map for irregular morphs. Morphs are stored in the following format:
 * ("irregular_morp","base_morp1[base_morphN]*"). exceptions its sorted
 * by irregular_morph. Loaded lazily by loadIrregulars() (null until then).
 */
protected ArrayList<KeyString> exceptions;
/**
 * Contains the list of prepositions stored in the preposition file. Prepositions
 * are used for detecting some verb collocations in Morphy. preposition its sorted.
 */
protected ArrayList<String> prepositions;
/**
 * Number of loaded definitions/samples.
 */
protected double glossCount;
/**
 * Number of words contained in the definitions/samples
 */
protected double wordCount;
/**
 * Maximum word size from collocations stored in the dictionary.
 * Upper bound for the greedy token window used by lemmatize(String).
 */
protected int maxCollocationSize;
/**
 * Returns lemma/senses memory mapping.
 * NOTE(review): exposes the internal (mutable) list directly.
 * @return synMaps
 */
public ArrayList<KeyArray> getSynMaps() {
return this.senseMaps;
}
/**
 * Returns glosses list.
 * @return senses
 */
public ArrayList<Sense> getSenses() {
return this.senses;
}
/**
 * Returns list with irregular morphs.
 * May be null until loadIrregulars() has run.
 * @return exceptions
 */
public ArrayList<KeyString> getExceptions() {
return this.exceptions;
}
/**
 * Tell us if the dictionary is loaded from the Web.
 * @return True if the dictionary is loaded from the Web.
 */
public boolean isWeb()
{
return this.isWeb;
}
/**
 * Returns a list with the prepositions.
 * @return prepositions
 */
public ArrayList<String> getPrepositions() {
return this.prepositions;
}
/**
 * Return a count of the senses in dictionary.
 * @return glossCount
 */
public double getGlossCount() {
return glossCount;
}
/**
 * Returns the maximum word size from collocations stored in dictionary.
 * @return maxCollocationSize
 */
public int getMaxCollocationSize() {
return maxCollocationSize;
}
/**
 * Retrieve a sense by its synsetId using binary search over glosses mapping.
 * @param sid The synsetId in format "Number_P".
 * @return The corresponding sense object.
 * @throws Exception when no sense with the given id exists. (Previously a
 * missing id surfaced as an obscure unchecked IndexOutOfBoundsException
 * caused by indexing with the negative binarySearch result.)
 */
public Sense getSense(String sid) throws Exception
{
int i=Collections.binarySearch(senses, sid);
if(i<0)
throw new Exception("Sense id \""+sid+"\" was not found in dictionary "+name);
return senses.get(i);
}
/**
 * Retrieve IDF for a lemma. Lemmas are in the format "lemma_P".
 * IDF is calculated by using each Sense as a document.
 * Note that even by loading <a href="http://www.cse.unt.edu/~rada/downloads.html">SemCor</a>
 * the samples will be added to a single Sense object, so samples are attached
 * to their corresponding Sense.
 * @param lemma The lemma to look for.
 * @return A double with the calculated IDF value.
 * @throws Exception
 */
public double getIDF(String lemma) throws Exception
{
double documents=(double)glossCount;
int hit=Collections.binarySearch(wordCounts,lemma);
if(hit<0)
{
// Unseen lemma: fall back to log of the raw document count.
return Math.log(documents);
}
double frequency=Integer.parseInt(wordCounts.get(hit).getString());
return Math.log(documents/(1.0+frequency));
}
/**
 * loadCoreData exists for allowing to parse new <a href="http://www.cse.unt.edu/~rada/downloads.html">SemCor</a> files.
 * loadCoreData loads sense information, mappings and relations.
 * However, it does not load glosses and samples.
 * After executing loadCoreData you can execute the method parseSamplesFromSemCor(String) for adding a new sample corpus.
 * @throws Exception
 */
public abstract void loadCoreData()throws Exception;
/**
 * Reads files in the path folder and creates
 * the memory mapping for all the terms in dictionary.
 * This method loads the samples for the bag of words.
 * Each dictionary has 2 different sample sources:
 * (1)Glosses (dictionary definitions),
 * (2)Samples (dictionary samples).
 * In addition, there exists corresponding corpus for some dictionaries, like (SemCor for WordNet 3.0).
 * Corpus are classified by dictionary in the Resources folder.
 * You can load your own corpus if you parse them first with the method parseSamplesFromSemCor(String).
 * Samples should be in valid SemCor format.
 * @param sampleSources A string with the sources that will form the bag of words.
 * Some valid examples are: "Glosses", "Glosses;Samples", "Glosses;Samples;SemCor;yoursamplesource"
 * @throws Exception
 */
public abstract void load(String sampleSources)throws Exception;
/**
 * Loads a count file.
 * A count file contains how many times a word appears in the sample source,
 * one "word|count" pair per line. Counts for words already present in
 * wordCounts are accumulated instead of replaced.
 * @param input Dictionary/corpus to be loaded.
 * @throws Exception on I/O errors or malformed count lines.
 */
public void loadCountsFromFile(FileReader input) throws Exception{
BufferedReader in=new BufferedReader(input);
try
{
String line=in.readLine();
while(line!=null)
{
String tokens[]=line.split("\\|");
int index=wordCounts.indexOf(new KeyString(tokens[0]));
if(index>=0)
{
// Word already counted: merge the two frequencies.
KeyString ks=wordCounts.get(index);
ks.setString(String.valueOf(Integer.parseInt(ks.getString())+Integer.parseInt(tokens[1])));
}
else
wordCounts.add(new KeyString(tokens[0],tokens[1]));
line=in.readLine();
}
}
finally
{
// Always release the reader, even when a line is malformed (fixes a leak).
in.close();
}
}
/**
 * This method parses the samples and counts for a valid SemCor format source.
 * After using this method you can load the samples from it by loading the source name.
 * Produces two files named after the source: path/samples/&lt;name&gt; (one line per
 * annotated sense: "sid|sample text|lemma lemma ...") and path/counts/&lt;name&gt;
 * (one line per lemma: "lemma|frequency").
 * @param source The name of the file or the folder that contains SemCor valid format files.
 * An error will be raised if a no SemCor file is mixed in the source folder.
 * @throws Exception
 */
public void parseSamplesFromSemCor(String source) throws Exception {
File fs=new File(source);
ArrayList<File> files=Util.getAllFiles(fs);
ArrayList<KeyString> counts=new ArrayList<KeyString>();
FileWriter writers=new FileWriter(path+"samples/"+fs.getName());
FileWriter writerc=new FileWriter(path+"counts/"+fs.getName());
BufferedWriter outs=new BufferedWriter(writers);
BufferedWriter outc=new BufferedWriter(writerc);
for(File file:files)
{
// NOTE(review): parser uses default settings; if sources can be untrusted,
// external entities (XXE) should be disabled — confirm provenance of files.
DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = fact.newDocumentBuilder();
Document test=builder.parse(file);
NodeList sentences=test.getElementsByTagName("s");
//Extracting lemmas from each sentence
for(int i=0;i<sentences.getLength();i++)
{
System.out.println("Parsing "+ file.getName() +" sentence "+(i+1));
Element sentence=(Element)sentences.item(i);
String sample=" ";
ArrayList<String> lemmas=new ArrayList<String>(20);
ArrayList<String> sids=new ArrayList<String>(20);
NodeList words=sentence.getElementsByTagName("wf");
for(int j=0;j<words.getLength();j++)
{
Element word=(Element)words.item(j);
sample+=word.getChildNodes().item(0).getNodeValue()+" ";
// Only fully disambiguated words (cmd="done" with a lemma) contribute lemmas.
if(word.getAttribute("cmd").equals("done")&&word.getAttribute("lemma").length()>0)
{
String pos=word.getAttribute("pos").substring(0,1).toUpperCase();
if(pos.equals("W"))
pos="R";
lemmas.add(word.getAttribute("lemma")+"_"+pos);
ArrayList<Sense> senses=getSenses(word.getAttribute("lemma")+"_"+pos);
if(senses.size()>0)
{
// "wnsn" may hold several sense numbers separated by ";".
String []correct=word.getAttribute("wnsn").split(";");
for(String c:correct)
{
if(c.equals("U"))
{
// Unknown sense: attach the sample to every sense of the lemma.
for(Sense sense:senses)
sids.add(sense.getSid());
}
else
{
// SemCor sense numbers are 1-based; ignore out-of-range annotations.
int index=Integer.parseInt(c)-1;
if(index<senses.size()&&index>=0)
{
sids.add(senses.get(index).getSid());
}
}
}
}
}
}
// Drop the blank appended after the last token.
sample=sample.substring(0, sample.length()-1);
String outline="";
for(String sid:sids)
{
outline=sid+"|"+sample+"|";
for(String lemma:lemmas)
{
outline+=lemma+" ";
}
outline=outline.substring(0, outline.length()-1);
outs.write(outline+"\n");
}
// Accumulate per-lemma frequencies for the counts file.
for(String lemma:lemmas)
{
KeyString ks=new KeyString(lemma,"1");
int index=counts.indexOf(ks);
if(index>=0)
{
ks=counts.get(index);
ks.setString(String.valueOf(Integer.parseInt(ks.getString())+1));
}
else
{
counts.add(ks);
}
}
}
}
for(KeyString ks:counts)
{
outc.write(ks.getKey()+"|"+ks.getString()+"\n");
}
outs.close();
outc.close();
writers.close();
writerc.close();
}
/**
 * Utility method for parsing dictionary glosses and samples.
 * Try to avoid its usage, unless you modify the dictionary or
 * erase the files in samples and counts resource folders.
 * @throws Exception
 */
public abstract void parseSamplesFromDictionary() throws Exception;
/**
 * Load the samples from a parsed source.
 * Each line has the format "sid|sample text|lemma lemma ..."; the lemma list
 * is optional. The lemmas are attached to the sense as a bag of words and
 * this.wordCount is increased by the number of lemmas loaded.
 * @param input The loaded source.
 * @throws Exception on I/O errors or when a referenced sense id is unknown.
 */
public void loadSamplesFromSource(FileReader input) throws Exception
{
BufferedReader in=new BufferedReader(input);
try
{
String line=in.readLine();
while(line!=null)
{
String tokens[]=line.split("\\|");
Sense ps=getSense(tokens[0]);
if(tokens.length>2)
{
// Split once instead of twice (was re-split just to take its length).
String words[]=tokens[2].split(" ");
ps.addBagOfWords(tokens[1], words,this.name);
this.wordCount+=(double)(words.length);
}
else
ps.addBagOfWords(tokens[1], new String[]{""},this.name);
line=in.readLine();
}
}
finally
{
// Release the reader even when a line is malformed (fixes a leak).
in.close();
}
}
/**
 * This method its similar to wn command of WordNet. It uses a lemma in
 * the format "lemma_P". First, base forms are retrieved with Morphy. Then,
 * senses are retrieved for the corresponding base forms.
 * @param lemma The lemma to look for.
 * @return An ArrayList with the senses for the lemma. An empty ArrayList will be
 * returned if the lemma wasn't found.
 * @throws Exception
 */
public ArrayList<Sense> getSenses(String lemma) throws Exception
{
String postag=lemma.substring(lemma.length()-1);
if(postag.equals("W"))
postag="R";
int pos=getPOS(postag);
postag="_"+postag;
ArrayList<String>lemmas=Morphy(lemma.substring(0,lemma.length()-2), pos);
ArrayList<Sense> senses=new ArrayList<Sense>(4);
if(pos>=0)
{
for(String lemmata:lemmas)
{
// Explicit index checks replace the former empty catch block that used
// exceptions (negative binarySearch index) as control flow.
int mapIndex=Collections.binarySearch(senseMaps,lemmata+postag);
if(mapIndex<0)
continue; // base form has no sense mapping; skip it
ArrayList<String> sids=senseMaps.get(mapIndex).getArray();
for(String sid:sids)
{
int senseIndex=Collections.binarySearch(this.senses, sid);
if(senseIndex>=0)
senses.add(this.senses.get(senseIndex));
}
}
}
return senses;
}
/**
 * Implementation of a lexical processor.
 * When this dictionary uses POS tags, the incoming morph is expected in the
 * "morph_P" format and is split into its base string and tag before dispatch;
 * otherwise the whole morph is looked up with an empty tag.
 * @param morph The word to be processed.
 * @return A list with the possible corresponding lemmas found in dictionary.
 * @throws Exception
 */
public ArrayList<String> Morphy(String morph) throws Exception
{
if(!this.usesPOSTag)
{
return Morphy(morph,"");
}
String base=morph.substring(0,morph.length()-2);
String tag=morph.substring(morph.length()-1);
return Morphy(base,tag);
}
/**
 * Implementation of a lexical processor.
 * Converts the letter POS tag into its numeric code and delegates.
 * @param morph The word to be processed.
 * @param postag The POS tag of the word ("N","V","A","R").
 * @return A list with the possible corresponding lemmas found in dictionary.
 * @throws Exception
 */
public ArrayList<String> Morphy(String morph, String postag) throws Exception
{
int pos=getPOS(postag);
return Morphy(morph,pos);
}
/**
 * Implementation of a lexical processor.
 * Generates candidate base forms via Transform (original casing plus
 * lower-cased) and keeps only those that exist in the dictionary,
 * without duplicates.
 * @param morph The word to be processed.
 * @param pos The POS tag of the word ("N=0","V=1","A=2","R=3").
 * @return A list with the possible corresponding lemmas found in dictionary.
 * @throws Exception
 */
public ArrayList<String> Morphy(String morph, int pos) throws Exception
{
ArrayList<String> found=new ArrayList<String>(5);
// Lookup suffix ("_P") is only appended for POS-tagged dictionaries.
String suffix=this.usesPOSTag?("_"+getPOS(pos)):"";
boolean searchable=(pos>=0)||!this.usesPOSTag;
if(searchable)
{
ArrayList<String> candidates=Transform(morph,pos);
candidates.addAll(Transform(morph.toLowerCase(),pos));
for(String candidate:candidates)
{
if(this.doesLemmaExists(candidate+suffix)&&!found.contains(candidate))
found.add(candidate);
}
}
return found;
}
/**
 * Method that leads to the execution of the proper morphological unit generator.
 * Currently only English is available; any other language yields no forms.
 * @param morph The word to be processed.
 * @param pos The POS tag of the word ("N=0","V=1","A=2","R=3").
 * @return A list with the possible base forms for the word.
 * The List could contain duplicates and invalid words.
 * @throws Exception
 */
public ArrayList<String> Transform(String morph, int pos) throws Exception
{
ArrayList<String> forms;
if(this.language.equals("en"))
forms=this.EnglishMorphy(morph, pos);
else
forms=new ArrayList<String>();
return forms;
}
/**
 * Morphological unit generator for English language.
 * Applies WordNet-Morphy-style detachment rules per POS, tries collocation
 * splits on "_", and finally appends any irregular base forms found in the
 * exception lists.
 * @param morph The word to be processed.
 * @param pos The POS tag of the word ("N=0","V=1","A=2","R=3").
 * @return A list with the possible base forms for the word.
 * The List could contain duplicates and invalid words.
 * @throws Exception
 */
private ArrayList<String> EnglishMorphy(String morph, int pos) throws Exception{
ArrayList<String> lemmas=new ArrayList<String>(8);
//include the morph as it
lemmas.add(morph);
switch(pos)
{
case 0:
// Nouns: for two-word collocations try N+N, N+V and A+N base-form pairs.
if(morph.contains("_"))
{
String split[]=morph.split("_");
if(split.length==2)
{
ArrayList<String> aux=Morphy(split[0], 0);
ArrayList<String> aux2=Morphy(split[1], 0);
for(String x:aux)
for(String y:aux2)
lemmas.add(x+"_"+y);
aux=Morphy(split[0], 0);
aux2=Morphy(split[1], 1);
for(String x:aux)
for(String y:aux2)
lemmas.add(x+"_"+y);
aux=Morphy(split[0], 2);
aux2=Morphy(split[1], 0);
for(String x:aux)
for(String y:aux2)
lemmas.add(x+"_"+y);
}
}
// "-ful" nouns: lemmatize the stem, then re-attach the suffix.
if(morph.endsWith("ful"))
{
ArrayList<String> aux=Morphy(morph.substring(0,morph.length()-3), pos);
for(String x:aux)
lemmas.add(x+"ful");
}
// Standard plural detachment rules (ses/xes/zes/ches/shes -> drop "es").
if(morph.endsWith("ses")||morph.endsWith("xes")||morph.endsWith("zes")||morph.endsWith("ches")||morph.endsWith("shes"))
{
lemmas.add(morph.substring(0, morph.length()-2));
}
if(morph.endsWith("ies"))
{
lemmas.add(morph.substring(0, morph.length()-3)+"y");
}
if(morph.endsWith("s"))
{
lemmas.add(morph.substring(0, morph.length()-1));
}
// "men" -> "man" (e.g. firemen -> fireman).
if(morph.endsWith("men"))
{
lemmas.add(morph.substring(0, morph.length()-2)+"an");
}
break;
case 1:
// Verbs: handle verb collocations, with a special path for
// verb_preposition_... patterns (lemmatize first and last word only).
if(morph.contains("_"))
{
if(hasPrepositions(morph))
{
String split[]=morph.split("_");
ArrayList<String> aux=Morphy(split[0], 1);
ArrayList<String> aux2=Morphy(split[split.length-1], 0);
// Fall back to the surface form when no base form was found.
if(aux.size()==0)
aux.add(split[0]);
if(aux2.size()==0)
aux2.add(split[split.length-1]);
for(String x:aux)
for(String y:aux2)
{
// Keep the middle words (prepositions etc.) untouched.
String t=x;
for(int i=1;i<split.length-1;i++)
t+="_"+split[i];
t+="_"+y;
lemmas.add(t);
}
}
else
{
String split[]=morph.split("_");
if(split.length==2)
{
ArrayList<String> aux=Morphy(split[0], 1);
ArrayList<String> aux2=Morphy(split[1], 0);
for(String x:aux)
for(String y:aux2)
lemmas.add(x+"_"+y);
}
}
}
// Inflection detachment: try dropping one and two chars of "es"/"ed".
if(morph.endsWith("es")||morph.endsWith("ed"))
{
lemmas.add(morph.substring(0,morph.length()-1));
lemmas.add(morph.substring(0,morph.length()-2));
}
if(morph.endsWith("ies"))
{
lemmas.add(morph.substring(0, morph.length()-3)+"y");
}
if(morph.endsWith("s"))
{
lemmas.add(morph.substring(0, morph.length()-1));
}
// Gerunds: "-ing" -> "-e" stem and bare stem.
if(morph.endsWith("ing"))
{
lemmas.add(morph.substring(0,morph.length()-3)+"e");
lemmas.add(morph.substring(0,morph.length()-3));
}
break;
case 2:
// Adjectives: comparative and superlative detachment.
if(morph.endsWith("er"))
{
lemmas.add(morph.substring(0,morph.length()-1));
lemmas.add(morph.substring(0,morph.length()-2));
}
if(morph.endsWith("est"))
{
lemmas.add(morph.substring(0,morph.length()-2));
lemmas.add(morph.substring(0,morph.length()-3));
}
break;
}
// Finally, merge irregular base forms from the exception lists (lazy-loaded).
this.loadIrregulars();
int i=Collections.binarySearch(exceptions, morph+"_"+Dictionary.getPOS(pos));
if(i>-1)
{
String lemmatas2[]=exceptions.get(i).getString().split(";");
for(String lemmata:lemmatas2)
if(!lemmas.contains(lemmata))
lemmas.add(lemmata);
}
return lemmas;
}
/**
 * Lazily loads the irregular-morph exception lists for this
 * dictionary's language. Does nothing once they are loaded.
 * @throws Exception when the exception files cannot be read.
 */
void loadIrregulars() throws Exception{
if(this.exceptions!=null)
return; // already loaded
if(this.language.equals("en"))
{
this.loadEnglishIrregulars();
}
else if(this.language.equals("es"))
{
//TODO
}
}
/**
 * Loads the English irregular-morph exception lists (noun/verb/adj/adv .exc
 * files) into the sorted exceptions list. Entries are stored as
 * ("irregular_P", "base1[;baseN]*"). Readers are now opened one at a time and
 * closed in a finally block, so a read error no longer leaks open files.
 * @throws Exception when an exception file is missing or unreadable.
 */
void loadEnglishIrregulars()throws Exception
{
//Loading exception list
System.out.println("Loading exception list");
exceptions=new ArrayList<KeyString>(5960);
// File order matches the numeric POS codes: 0=N, 1=V, 2=J, 3=R.
String excFiles[]=new String[]{
"data/exceptions/en/noun.exc",
"data/exceptions/en/verb.exc",
"data/exceptions/en/adj.exc",
"data/exceptions/en/adv.exc"};
for(int i=0;i<excFiles.length;i++)
{
String pos="_"+getPOS(i);
BufferedReader in=new BufferedReader(new FileReader(excFiles[i]));
try
{
String line=in.readLine();
while(line!=null)
{
// Line format: "irregular base1 [base2 ...]"; join extra bases with ";".
String tokens[]=line.split(" ");
for(int j=2;j<tokens.length;j++)
tokens[1]+=";"+tokens[j];
exceptions.add(new KeyString(tokens[0]+pos,tokens[1]));
line=in.readLine();
}
}
finally
{
in.close();
}
}
Collections.sort(exceptions);
}
/**
 * Method for detecting if a collocation has a preposition in it.
 * Only interior words are inspected; the first and last words of the
 * collocation are never considered.
 * @param morph The collocation to process. White spaces must be replaced with "_".
 * @return true if the collocation has a preposition in it.
 */
public boolean hasPrepositions(String morph) {
String parts[]=morph.split("_");
boolean found=false;
for(int i=1;i<parts.length-1&&!found;i++)
{
found=prepositions.contains(parts[i]);
}
return found;
}
/**
 * Returns the corresponding POS tag letter for a numeric code.
 * @param pos The POS tag of the word ("N=0","V=1","A=2","R=3").
 * @return The POS tag letter ("N","V","J","R"). An empty string if pos was not a valid number.
 */
public static String getPOS(int pos) {
switch(pos)
{
case 0:
return "N";
case 1:
return "V";
case 2:
return "J";
case 3:
return "R";
default:
return "";
}
}
/**
 * Lemmatizer that uses Morphy as morphological processor
 * and Stanford's Part-Of-Speech Tagger.
 * Greedily consumes ("eats") the longest run of tokens (up to
 * maxCollocationSize) that Morphy recognizes as a collocation; unrecognized
 * tokens are skipped one at a time.
 * @param line The text to be processed.
 * @return An ArrayList that contains an ArrayList of the possible lemmas of each word.
 * Most of the words have one corresponding lemma in WordNet, but, there are some exceptions. For
 * example: axes_N-&gt; (axis_N, ax_N, axe_N).
 * @throws Exception
 */
public ArrayList<ArrayList<String>> lemmatize(String line) throws Exception
{
this.setTagger();
List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(line));
ArrayList<TaggedWord> tSentence= new ArrayList<TaggedWord>();
for (List<HasWord> sentence : sentences) {
tSentence.addAll(Dictionary.tagger.tagSentence(sentence));
}
ArrayList<ArrayList<String>> aux=new ArrayList<ArrayList<String>>(tSentence.size());
ArrayList<String> temp;
ArrayList<String> lemmas;
String comp,pos;
int i;
//Morphy eater
while(tSentence.size()>0)
{
comp="";
// POS of the window defaults to the tag of its first token.
pos=tSentence.get(0).tag().substring(0, 1);
//try to eat the most tokens as possible
for(i=tSentence.size()>maxCollocationSize?maxCollocationSize:tSentence.size();i>0;i--)
{
// Build the candidate collocation of the first i tokens joined by "_".
comp=tSentence.get(0).word().toLowerCase();
for(int j=1;j<i;j++)
{
comp+="_"+tSentence.get(j).word().toLowerCase();
}
temp=Morphy(comp,pos);
if(temp.size()==0 && i>1)//try for the last element value
{
temp=Morphy(comp,tSentence.get(i-1).tag().substring(0, 1));
if(temp.size()>0)
pos=tSentence.get(i-1).tag().substring(0, 1);
}
//temp must contain something for eat something
if(temp.size()>0)
{
// Tag each distinct lemma with the POS that produced the match.
lemmas= new ArrayList<String>();
for(String lemma:temp)
if(!lemmas.contains(lemma+"_"+pos))
lemmas.add(lemma+"_"+pos);
aux.add(lemmas);
temp=null;
break;
}
}
//eat i tokens
// i==0 means nothing matched: consume one token so the loop advances.
if(i==0)
i++;
while(i>0)
{
tSentence.remove(0);
i--;
}
}
return aux;
}
/**
 * Open-class words extracted with the
 * Part-Of-Speech Tagger.
 * This lemmatizer returns an unlemmatized list of tagged words.
 * Words whose tag's first letter is not one of N/V/J/R (per getPOS) are dropped.
 * @param line The text to be processed.
 * @return An ArrayList that contains the tagged words, one singleton list per word.
 * @throws Exception
 */
public ArrayList<ArrayList<String>> POSTagging(String line) throws Exception
{
this.setTagger();
List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(line));
ArrayList<TaggedWord> tSentence= new ArrayList<TaggedWord>();
for (List<HasWord> sentence : sentences) {
tSentence.addAll(Dictionary.tagger.tagSentence(sentence));
}
ArrayList<ArrayList<String>> aux=new ArrayList<ArrayList<String>>(tSentence.size());
String pos;
for(TaggedWord word:tSentence)
{
pos=word.tag().substring(0, 1);
// Keep only open-class words (tags that map to a valid numeric POS).
if(getPOS(pos)>=0)
{
ArrayList<String> x=new ArrayList<String>(1);
x.add(word.word()+"_"+pos);
aux.add(x);
}
}
return aux;
}
/**
 * Returns the numeric code for a POS tag letter.
 * @param pos The POS tag of the word ("N=0","V=1","A=2","R=3","W=3").
 * @return The POS code. -1 if pos its an invalid tag.
 */
public static int getPOS(String pos) {
if(pos.equals("N"))
return 0;
if(pos.equals("V"))
return 1;
if(pos.equals("J"))
return 2;
if(pos.equals("R")||pos.equals("W"))
return 3;
return -1;
}
/**
 * Sets the path/URL of the source dictionary.
 * @param path The new resource path/URL.
 */
public void setPath(String path)
{
this.path=path;
}
/**
 * Returns this dictionary's name as its string representation.
 */
@Override
public String toString()
{
return name;
}
/**
 * Write all the loaded data into SuperLemma files.
 * SuperLemma files act as cache files for all the dictionaries.
 * Also writes a "&lt;name&gt;.sta" file holding glossCount and wordCount.
 * Progress is reported to stdout every 1000 lemmas.
 * @param path Output folder for the .sta and .slm files.
 * @throws Exception on I/O errors or unknown lemmas.
 */
public void WriteSuperLemmas(String path)throws Exception
{
int i=0;
FileWriter f=new FileWriter(path+this.name+".sta");
BufferedWriter out=new BufferedWriter(f);
out.write(String.valueOf(this.glossCount)+"\n");
out.write(String.valueOf(this.wordCount)+"\n");
out.close();
f.close();
for(KeyArray Key:this.senseMaps)
{
String lemma=Key.getKey();
ArrayList<Sense> senses=this.getSenses(lemma);
ArrayList<Count> counts=this.getCounts(lemma);
// Merge this dictionary's data into any existing cached SuperLemma.
SuperLemma s=this.loadSuperLemma(lemma,path);
Lemma lemmaO=new Lemma(lemma,senses.get(0).getPos(),senses,counts,this.name);
s.addLemma(lemmaO);
this.WriteSuperLemma(path,s);
i++;
if(i%1000==0)
{
System.out.println("Saving slm file "+String.valueOf(i)+"/"+String.valueOf(this.senseMaps.size()));
}
}
}
/**
 * Returns the frequency of a word in the dictionary.
 * Unknown words get a frequency of 0.0.
 * @param lemma The target word.
 * @return The frequency is stored inside an ArrayList&lt;Count&gt; of size=1.
 */
public ArrayList<Count> getCounts(String lemma) {
int idx=Collections.binarySearch(wordCounts,lemma);
double frequency=0.0;
if(idx>=0)
frequency=Double.parseDouble(wordCounts.get(idx).getString());
ArrayList<Count> result=new ArrayList<Count>();
result.add(new Count(frequency,this.name));
return result;
}
/**
 * Escapes characters that are unsafe in file names so a lemma can be used
 * as part of an .slm cache file name.
 * (The previous javadoc was a copy-paste from loadSuperLemma.)
 * @param lemma The lemma to normalize.
 * @return The lemma with /, ?, ", :, &gt; and * replaced by placeholder runs.
 */
public static String normalizeLemmaforFile(String lemma)
{
String safe=lemma;
safe=safe.replace("/", "+++");
safe=safe.replace("?", "@@@");
safe=safe.replace("\"", "@@@@");
safe=safe.replace(":","@@@@@");
safe=safe.replace(">", "@@@@@@");
safe=safe.replace("*","xxxxxx");
return safe;
}
/**
 * Loads a SuperLemma from a file.
 * @param lemma The target lemma.
 * @param path The path containing this superlemma.
 * @return The corresponding SuperLemma if any. Otherwise, it returns a new empty SuperLemma.
 * @throws Exception
 */
public SuperLemma loadSuperLemma(String lemma,String path) throws Exception{
File cache=new File(path+Dictionary.normalizeLemmaforFile(lemma)+".slm");
if(!cache.exists())
{
// No cached file yet: start from an empty SuperLemma.
return new SuperLemma(lemma);
}
return (SuperLemma)Util.loadObject(cache);
}
/**
 * Writes a file containing the target super lemma.
 * The file name is derived from the lemma via normalizeLemmaforFile plus ".slm".
 * @param path Output folder.
 * @param superlemma The target super lemma.
 * @throws Exception on I/O errors.
 */
public void WriteSuperLemma(String path, SuperLemma superlemma) throws Exception{
Util.writeObject(new File(path+Dictionary.normalizeLemmaforFile(superlemma.getLemma())+".slm"), superlemma);
}
/**
 * Returns the number of words in the definitions/samples.
 * @return The number of words in the definitions/samples.
 */
public double getWordCount()
{
return this.wordCount;
}
/**
 * Lemmatizes a full text returning lemmatized paragraphs. It automatically
 * does sentence splitting and lemmatization.
 * Lemmas not found in the dictionary are keeped in its base form.
 * NOTE(review): the "###" prefix for unlemmatized open-class words mentioned
 * here is not implemented in this body — confirm against callers.
 * Powered by Stanford POS tagger.
 * @param text Target text.
 * @param chunks Empty ArrayList for storing the original sentence in chunks having the same structure as the return value.
 * @return An ArrayList having paragraphs as elements. Each paragraph (Element) contains
 * a set of sentences. Each sentence contains a set of words.
 */
public ArrayList<ArrayList<ArrayList<String>>> lemmatize(ArrayList<String> text,ArrayList<ArrayList<ArrayList<String>>> chunks) throws Exception{
ArrayList<ArrayList<ArrayList<String>>> ptext=new ArrayList<ArrayList<ArrayList<String>>>(text.size());
Lemma l=null;
this.setTagger();
for(String p:text)
{
List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(p));
ArrayList<ArrayList<String>> paragraph=new ArrayList<ArrayList<String>>(sentences.size());
ArrayList<ArrayList<String>> cparagraph=new ArrayList<ArrayList<String>>(sentences.size());
ArrayList<TaggedWord> tSentence= new ArrayList<TaggedWord>();
for(List<HasWord> sentence:sentences)
{
tSentence.clear();
tSentence.addAll(Dictionary.tagger.tagSentence(sentence));
// psentence holds lemmatized tokens; csentence the original surface forms.
ArrayList<String> psentence=new ArrayList<String>(tSentence.size());
ArrayList<String> csentence=new ArrayList<String>(tSentence.size());
String lemmata,lemma,pos;
for(TaggedWord word:tSentence)
{
pos=word.tag().substring(0,1).toUpperCase();
lemma=word.word();
csentence.add(word.word());
if(this.usesPOSTag)
{
lemmata=lemma;
// Default to the surface form with its tag; replaced below when a
// dictionary lemma is found.
lemma=lemma+"_"+pos;
if(pos.equals("N")||pos.equals("J")||pos.equals("R")||pos.equals("V"))
{
ArrayList<String>lemmatas=this.Morphy(lemmata, pos);
for(String plemmata:lemmatas)
{
// Keep the first base form that exists in the dictionary.
l=this.getLemma(plemmata+"_"+pos);
if(l!=null)
{
lemma=plemmata+"_"+pos;
break;
}
}
}
}
else
{
l=this.getLemma(lemma);
}
psentence.add(lemma);
l=null;
}
paragraph.add(psentence);
cparagraph.add(csentence);
}
ptext.add(paragraph);
chunks.add(cparagraph);
}
return ptext;
}
/**
 * Returns a Lemma object for the corresponding lemma.
 * @param lemma The target lemma.
 * @return The corresponding lemma object or null when the target lemma does not exists in the dictionary.
 * @throws Exception
 */
public abstract Lemma getLemma(String lemma) throws Exception;
/**
 * Computes the IDF value for an already-loaded lemma object.
 * The total frequency is smoothed with +1 to avoid division by zero.
 * @param lemmaObject The lemma whose counts are aggregated.
 * @return Corresponding IDF value
 */
public double getIDF(Lemma lemmaObject) {
double total=1.0;
for(Count c:lemmaObject.counts)
total+=c.getFrequency();
return Math.log(this.glossCount/total);
}
/**
 * Tells whether a POS tag consists only of punctuation characters.
 * Generalized from "\\W" to "\\W+" so multi-character punctuation tags
 * (e.g. Penn Treebank "``" and "''") are also recognized; single-character
 * behavior is unchanged.
 * @param pos The POS tag to test.
 * @return true when the tag is one or more non-word characters.
 */
public static boolean isPunctuation(String pos) {
return pos.matches("\\W+");
}
/**
 * Method that tells if a target lemma has entries in this dictionary.
 * @param lemma Target lemma (with "_P" suffix when the dictionary uses POS tags).
 * @return True, if an entry exists corresponding to the target lemma.
 * @throws Exception when the lookup fails (e.g. Web dictionaries).
 */
public abstract boolean doesLemmaExists(String lemma) throws Exception;
}