package gannuNLP.dictionaries;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.jdom2.Content;
import org.jdom2.Content.CType;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.filter.ElementFilter;
import org.jdom2.filter.Filter;
import org.jdom2.input.JDOMParseException;
import org.jdom2.input.SAXBuilder;
import gannuNLP.data.Count;
import gannuNLP.data.Lemma;
import gannuNLP.data.Relation;
import gannuNLP.data.Sense;
import gannuUtil.Util;
/**
* Connector to Wikipedia dictionary.
* The paragraphs before the Table of Contents are taken as definitions.
* Please set the URL of your own Wikipedia mirror with the setPath method.
* The valid versions of the Wikipedia connector are en (English) and es (Spanish).
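* <p>A minimal usage sketch (the mirror URL below is only an example; note the
* trailing slash, since queries are built as path + "wiki/" + article):</p>
* <pre>
* Wiki wiki = new Wiki();
* wiki.setVersion("en");
* wiki.setPath("http://en.wikipedia.org/");
* wiki.load("");
* Lemma lemma = wiki.getLemma("jaguar");
* </pre>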
* @author Francisco Viveros-Jiménez
*
*/
public class Wiki extends Dictionary {
/**
*
*/
private static final long serialVersionUID = 1L;
/**
* String for identifying a missing page. E.g. "Wikipedia does not have an article with this exact name".
*/
String missingMSG;
/**
* String for identifying a bad Wiki URL. E.g. "Wikimedia Error".
*/
String wikiErrorMSG;
/**
* String for identifying definitions inside a specified section of a Wiki article.
* E.g. "redirectToFragment(\"#".
*/
String redirect;
/**
* String for identifying the disambiguation link of an article. E.g. "For other uses".
*/
String disambiguationMSG;
/**
* String for identifying the section containing the search hits in the Wiki search page.
* E.g. "Results ".
*/
String countText;
/**
* Preposition written before the search hits (white space must be included). E.g. "of ".
*/
String countPrepIn;
/**
* Preposition written after the search hits (white space must be included). E.g. " for".
*/
String countPrepOut;
/**
* String for identifying a disambiguation page. E.g. "_(disambiguation)".
*/
String disambiguationWord;
/**
* Internal flag for avoiding infinite loops while crawling.
*/
private boolean jump;
/**
* Maximum number of attempts for avoiding infinite connection/parsing retries while crawling.
*/
int attempts;
/**
* List of searches known to return no article.
*/
ArrayList<String> badSearches;
/**
* Instantiates a new Wikipedia connector.
* You must call the setVersion, setPath and load methods before querying Wikipedia.
*/
public Wiki()
{
super();
this.attempts=10;
this.isWeb=true;
this.jump=false;
this.usesPOSTag=false;
}
/**
* Sets all the necessary values for processing the Wikipedia of a specified language.
* English ("en") and Spanish ("es") are implemented.
* Please modify this method for adding support for other languages.
* @param language String for identifying the language. Please use the exact same code as Wikipedia.
* E.g. "en" for English or "es" for Spanish.
*/
public void setVersion(String language)
{
this.language=language;
if(language.equals("en"))
{
this.disambiguationWord="_(disambiguation)";
this.name="English Wikipedia";
this.missingMSG="Wikipedia does not have an article with this exact name";
this.wikiErrorMSG="Wikimedia Error";
this.redirect="redirectToFragment(\"#";
this.disambiguationMSG="For other uses";
this.countText="Results ";
this.countPrepIn="of ";
this.countPrepOut=" for";
}
if(language.equals("es"))
{
this.disambiguationWord="_(desambiguaci�n)";
this.name="Wikipedia en Espa�ol";
this.missingMSG="Wikipedia a�n no tiene una p�gina llamada";
this.wikiErrorMSG="Wikimedia Error";
this.redirect="redirectToFragment(\"#";
this.disambiguationMSG="Para otras acepciones";
this.countText="Resultados ";
this.countPrepIn="de ";
this.countPrepOut=" para";
}
}
/**
* Method for searching the XML node containing the actual Wiki article of a target document.
* @param xml Target document.
* @return The XML node containing the article.
*/
private Element getContentNode(Document xml)
{
//Search for mw-content-text
//Gloss starts inside the first <p> nodes and ends with the appearance of another node type (like <div> or <h2>)
Element root=xml.getRootElement();
Element body=null;
for(Element e:root.getChildren())
{
if(e.getName().equals("body"))
{
body=e;
break;
}
}
Element content=null;
for(Element e:body.getChildren())
{
if(e.getAttribute("id")!=null&&e.getAttributeValue("id").equals("content"))
{
content=e;
break;
}
}
Element bodycontent=null;
for(Element e:content.getChildren())
{
if(e.getAttribute("id")!=null&&e.getAttribute("id").getValue().equals("bodyContent"))
{
bodycontent=e;
break;
}
}
Element gloss=null;
for(Element e:bodycontent.getChildren())
{
if(e.getAttribute("id")!=null&&e.getAttribute("id").getValue().equals("mw-content-text"))
{
gloss=e;
break;
}
}
return gloss;
}
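/**
 * Loads the total article count from the mirror's main page (used as glossCount)
 * and the cached list of unresolvable searches from ./data/Wiki.bws.
 * Falls back to offline mode when the mirror is unreachable.
 */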
@SuppressWarnings("unchecked")
@Override
public void loadCoreData() throws Exception {
URL url=new URL(this.path);
this.glossCount=0.0;
try
{
Document xml=this.loadURL(url);
if(language.equals("en"))
{
for(Element div:xml.getDescendants(new ElementFilter("a")))
{
if(div.getAttributeValue("href")!=null&&div.getAttributeValue("href").equals("/wiki/Special:Statistics"))
{
String str=div.getValue();
System.out.println(str);
this.glossCount=Double.parseDouble(str.replace(",", ""));
break;
}
}
}
else
{
for(Element div:xml.getDescendants(new ElementFilter("li")))
{
if(div.getAttributeValue("id")!=null && div.getAttributeValue("id").contains("lang-"))
{
for(Element span:div.getDescendants(new ElementFilter("span")))
{
if(span.getAttributeValue("lang").equals(language))
{
String str=div.getValue();
str=str.substring(0,str.indexOf(" articles"));
str=str.split("More than ")[1];
System.out.println(str);
this.glossCount=Double.parseDouble(str.replace(",", ""));
}
}
}
}
}
if(language.equals("es"))
{
String f=xml.getRootElement().getValue();
f=f.substring(f.indexOf(" art�culos en espa�ol")-30,f.indexOf("art�culos en espa�ol")).split("\n")[1].replace("\u00a0","");
this.glossCount=Double.parseDouble(f);
}
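//Rough estimate: assume an average of about 320 words per Wikipedia article.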
this.wordCount=this.glossCount*320.0;
url=new URL(this.path);
url.openStream();
}
catch(Exception e)
{
System.out.println("Warning wiki mirror is unreacheable, switching to offline mode!");
}
File f=new File("./data/Wiki.bws");
if(f.exists())
this.badSearches=(ArrayList<String>)Util.loadObject(f);
else
this.badSearches=new ArrayList<String>();
}
@Override
public void load(String sampleSources) throws Exception {
this.loadCoreData();
}
/**
 * Under construction.
 */
@Override
public void parseSamplesFromDictionary() throws Exception {
//TODO
}
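/**
 * Builds a lemma by crawling its disambiguation page (lemma + disambiguationWord) or,
 * when no disambiguation page exists, the article page itself.
 * Lemmas that cannot be resolved are cached in ./data/Wiki.bws and not queried again.
 * @param lemma Target word.
 * @return The lemma, or null when no article was found.
 */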
@Override
public Lemma getLemma(String lemma) throws Exception {
URL url=new URL(this.path+"wiki/"+lemma.replace(" ", "_")+this.disambiguationWord);
Lemma l=null;
int iidx=this.badSearches.indexOf(lemma);
if(iidx<0)
{
Document xml=this.loadURL(url);
String text="";
if(xml!=null)
text=xml.getRootElement().getValue();
if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
{
ArrayList<URL> urls=new ArrayList<URL>();
ArrayList<Integer> levels=new ArrayList<Integer>();
ArrayList<Integer> hindex=new ArrayList<Integer>();
ArrayList<String> heads=new ArrayList<String>();
for(Element div:xml.getDescendants(new ElementFilter("div")))
{
if(div.getAttributeValue("id")!=null&&div.getAttributeValue("id").equals("content"))
{
ElementFilter f=new ElementFilter("a");
Filter<? extends Content> or=f.or(new ElementFilter("span"));
for(Content c:div.getDescendants(or))
{
if(c.getCType()==CType.Element)
{
Element word=(Element)c;
if(word.getName().equals("a"))
{
String href=word.getAttributeValue("href");
if(href!=null&&href.startsWith("/wiki"))
{
urls.add(new URL(this.path+href));
hindex.add(Integer.valueOf(heads.size()-1));
}
}
else
{
if(word.getAttributeValue("class")!=null && word.getAttributeValue("class").equals("mw-headline"))
{
heads.add(word.getAttributeValue("id"));
Element e=word.getParentElement();
levels.add(Integer.valueOf(Integer.parseInt(e.getName().substring(1))-2));
}
}
}
}
break;
}
}
ArrayList<Sense> senses=new ArrayList<Sense>(urls.size());
for(int i=0;i<urls.size();i++)
{
String urlx=urls.get(i).getFile().replace("/wiki/", "");
if(!this.isNotAnArticle(urlx))//it is a sense
{
senses.add(this.getSense(urlx.replace("/", "")));
}
}
ArrayList<Count> counts=new ArrayList<Count>();
counts.add(this.getWikiCounts(lemma));
l=new Lemma(lemma,"",senses,counts,this.name);
}
else
{
url=new URL(this.path+"wiki/"+lemma.replace(" ", "_"));
xml=this.loadURL(url);
if(xml!=null)
text=xml.getRootElement().getValue();
if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
{
if(text.contains(this.disambiguationMSG)&&!this.jump)
{
this.jump=true;
ArrayList<Sense> senses=new ArrayList<Sense>();
Element body=this.getContentNode(xml);
for(Element e:body.getChildren())
{
if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("dablink"))
{
for(Content c:e.getContent())
{
if(c.getCType().equals(CType.Element))
{
Element a=((Element)c);
if(a.getName().equals("a"))
{
String sid=a.getAttributeValue("href");
sid=sid.substring(sid.indexOf("wiki/")+5);
sid=sid.replace(this.disambiguationWord, "");
Lemma ll=this.getLemma(sid);
if(ll!=null)
{
for(Sense s:ll.getSenses())
{
if(!senses.contains(s))
senses.add(s);
}
}
}
}
}
}
}
this.jump=false;
if(senses.size()>0)
{
ArrayList<Count> counts=new ArrayList<Count>();
counts.add(this.getWikiCounts(lemma));
l=new Lemma(lemma,"",senses,counts,this.name);
}
}
else
{
ArrayList<Sense> senses=new ArrayList<Sense>(1);
String urlx=url.getFile().replace("/wiki/", "");
if(!this.isNotAnArticle(urlx))//it is a sense
{
senses.add(this.getSense(urlx));
ArrayList<Count> counts=new ArrayList<Count>();
counts.add(this.getWikiCounts(lemma));
l=new Lemma(lemma,"",senses,counts,this.name);
}
}
}
}
}
if(l==null&&iidx<0)
{
this.badSearches.add(lemma);
this.badSearches=Util.removeDuplicates(this.badSearches);
Collections.sort(this.badSearches);
File f=new File("./data/Wiki.bws");
Util.writeObject(f, this.badSearches);
}
return l;
}
/**
* Downloads a specified URL.
* @param url Target URL.
* @return The HTML document.
* @throws Exception
*/
Document loadURL(URL url) throws Exception{
SAXBuilder builder=new SAXBuilder();
Document xml=null;
HttpURLConnection con=(HttpURLConnection) url.openConnection();
con.setInstanceFollowRedirects(true);
InputStream in;
try
{
in = con.getInputStream();
}
catch(Exception e)
{
in=con.getErrorStream();
}
String encoding = con.getContentEncoding();
encoding = encoding == null ? "UTF-8" : encoding;
if(encoding.equals("gzip"))
{
in=new GZIPInputStream(con.getInputStream());
encoding="UTF-8";
}
String body = IOUtils.toString(in, encoding).replace(" & ", " &amp; ");
for(int i=0;i<this.attempts;i++)
{
try
{
xml=(Document) builder.build(new StringReader(body));
break;//document parsed successfully
}
catch(JDOMParseException ex)
{
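//Recover from an unescaped '&': locate the reported parse position,
//walk back to the nearest '&', escape it as &amp; and retry the parse.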
int line=ex.getLineNumber();
int col=ex.getColumnNumber();
int cline=1;
int index=0;
while(cline<line)
{
index=body.indexOf("\n",index)+1;
cline++;
}
//line found
index+=col;
if(index>=body.length())
index=body.length()-1;
while(index>=0&&body.charAt(index)!='&')
{
index--;
}
if(index>-1)
{
String firstHalf=body.substring(0,index);
String otherHalf=body.substring(index+1);
body=firstHalf+"&amp;"+otherHalf;
}
}
}
return xml;
}
/**
* Downloads a target URL as plain text.
* @param url Target URL.
* @return Plain text of the URL.
* @throws Exception
*/
String loadURLAsText(URL url) throws Exception{
HttpURLConnection con=(HttpURLConnection) url.openConnection();
con.setInstanceFollowRedirects(true);
InputStream in;
try
{
in = con.getInputStream();
}
catch(Exception e)
{
in=con.getErrorStream();
}
String encoding = con.getContentEncoding();
encoding = encoding == null ? "UTF-8" : encoding;
if(encoding.equals("gzip"))
{
in=new GZIPInputStream(con.getInputStream());
encoding="UTF-8";
}
String body = IOUtils.toString(in, encoding).replace(" & ", " &amp; ");
return body;
}
/**
* Returns the search hits for a given word.
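* <p>A minimal sketch, assuming a loaded connector named {@code wiki}:</p>
* <pre>
* Count hits = wiki.getWikiCounts("jaguar");
* </pre>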
* @param lemma Target word.
* @return A count object containing the search hits.
* @throws Exception
*/
public Count getWikiCounts(String lemma) throws Exception{
double w=0.0;
URL count=new URL(this.path+"/w/index.php?title=Special%3ASearch&profile=default&search="+lemma.replace(" ", "%20")+"&fulltext=Search");
Document xml=this.loadURL(count);
if(xml!=null)
{
String aux="mw-search-formheader";
for(Element e:this.getContentNode(xml).getDescendants(new ElementFilter("div")))
{
if(aux.equals(e.getAttributeValue("class")))
{
for(Element x:e.getDescendants(new ElementFilter("div")))
{
aux="results-info";
if(aux.equals(x.getAttributeValue("class")))
{
aux=e.getValue().substring(e.getValue().indexOf(this.countText)+this.countText.length());
aux=aux.substring(aux.indexOf(this.countPrepIn)+this.countPrepIn.length());
aux=aux.substring(0,aux.indexOf(this.countPrepOut));
aux=aux.replaceAll("\\D", "");
w=Double.parseDouble(aux);
break;
}
}
break;
}
}
}
else
{
String aux=this.loadURLAsText(count);
aux=aux.substring(aux.indexOf("mw-search-formheader"));
aux=aux.substring(aux.indexOf("results-info"));
aux=aux.substring(aux.indexOf(this.countPrepIn)+this.countPrepIn.length());
aux=aux.substring(0,aux.indexOf(this.countPrepOut));
aux=aux.replaceAll("\\D", "");
w=Double.parseDouble(aux);
}
Count c=new Count(w, this.name);
return c;
}
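/**
 * Retrieves the sense corresponding to a Wikipedia article.
 * Senses are cached as .wco files under ./data/wiki/ so each article is downloaded only once.
 * @param sid Name of the article as specified in the Wikipedia URL. E.g. "Gray_wolf".
 * @return The sense with its definition and its inGloss, inNavBox and inCatLinks relations.
 * @throws Exception
 */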
public Sense getSense(String sid)throws Exception {
Sense s;
File f=new File("./data/wiki/"+Dictionary.normalizeLemmaforFile(this.getCompleteName())+"/"+Dictionary.normalizeLemmaforFile(sid)+".wco");
File dir=new File("./data/wiki/"+Dictionary.normalizeLemmaforFile(this.getCompleteName())+"/");
dir.mkdirs();
if(f.exists())
{
s=(Sense)Util.loadObject(f);
}
else
{
ArrayList<String> syns=new ArrayList<String>(1);
syns.add(sid);
s=new Sense(sid,"",syns);
URL url=new URL(this.path+"wiki/"+sid.replace(" ", "_"));
Document xml=this.loadURL(url);
Element body=this.getContentNode(xml);
//Check if definition is a section
if(!xml.getRootElement().getValue().contains(this.redirect))
{
boolean start=false;
for(Element e:body.getChildren())
{
if(e.getName().equals("p"))
{
start=true;
s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
for(Element ge:e.getChildren())
{
//Retrieve all the <a> for extracting the inGloss relation
if(ge.getName().equals("a"))
{
String nurl=ge.getAttributeValue("href");
s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));
}
}
}
else
{
if(start)
break;
}
}
}
else//Look for the start of the section
{
String all=xml.getRootElement().getValue();
String section=all.substring(all.indexOf(this.redirect)+this.redirect.length());
section=section.substring(0,section.indexOf("\")"));
boolean start=false;
for(Element e:body.getChildren())
{
if(start)
{
if(e.getName().equals("p"))
{
s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
for(Element ge:e.getChildren())
{
//Retrieve all the <a> for extracting the inGloss relation
if(ge.getName().equals("a"))
{
String nurl=ge.getAttributeValue("href");
s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));
}
}
}
else
{
break;
}
}
else
{
if(e.getName().startsWith("h"))
{
for(Element es:e.getChildren())
{
if(es.getName().equals("span")&§ion.equals(es.getAttributeValue("id")))
{
start=true;
}
}
}
}
}
}
Element navbox=null;
for(Element e:xml.getRootElement().getDescendants(new ElementFilter("table")))
{
if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("navbox"))
{
navbox=e;
break;
}
}
//Add inNavBox relations
if(navbox!=null)
{
for(Element e:navbox.getDescendants(new ElementFilter("a")))
{
String nurl=e.getAttributeValue("href");
if(nurl!=null&&!this.isNotAnArticle(nurl))
{
s.addRelation("inNavBox", new Relation("inNavBox", nurl.replace("/wiki/", ""), ""));
}
}
}
//Add in CatLinks relations
Element catlinks=null;
String aux="catlinks";
for(Element e:body.getParent().getDescendants(new ElementFilter("div")))
{
if(aux.equals(e.getAttributeValue("id")))
{
catlinks=e;
break;
}
}
if(catlinks!=null)
{
for(Element e:catlinks.getDescendants(new ElementFilter("a")))
{
String nurl=e.getAttributeValue("href");
if(nurl!=null&&!this.isNotAnArticle(nurl))
{
s.addRelation("inCatLinks", new Relation("inCatLinks", nurl.replace("/wiki/", ""), ""));
}
}
}
Util.writeObject(f, s);
}
return s;
}
/**
* Tells whether the target URL points to an actual article.
* Please modify this method for adding pages that you believe must be excluded or
* for adding support for a new language.
* @param url Target URL.
* @return True if the target page is not an article.
*/
public boolean isNotAnArticle(String url)
{
boolean ban=url.contains("Template:");
ban=ban||url.contains("w/index.php?")||url.contains("Template_talk:")||url.contains("Help:");
ban=ban||url.contains("Wikipedia:")||url.contains("File:")||url.contains(".");
ban=ban||url.contains("Special:")||url.contains("#cite_note")||url.startsWith("#");
ban=ban||url.contains("Especial:")||url.contains("Ayuda:")||url.contains("Category:");
ban=ban||url.contains("Categor�a:")||url.contains("Talk:")||url.contains("Discusi�n:");
ban=ban||url.contains("List_of")||url.contains("Lists_of")||url.contains(this.disambiguationWord);
return ban;
}
/**
* Method for creating an SGF file from a Wikipedia article.
* Use this method for creating a corpus made from Wikipedia articles.
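* <p>A minimal sketch (the output folder is only an example):</p>
* <pre>
* wiki.createInputFromArticle("Gray_wolf", "./corpora/wiki/");
* </pre>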
* @param sid Name of the article as specified in the Wikipedia URL.
* E.g. "Gray_wolf", "Iron_man", etc.
* @param path Folder for saving the SGF file.
* @throws Exception
*/
@SuppressWarnings("unused")
public void createInputFromArticle(String sid,String path)throws Exception
{
File f=new File(path);
f.mkdirs();
FileWriter fout=new FileWriter(path+sid+".sgf");
BufferedWriter out=new BufferedWriter(fout);
URL url=new URL(this.path+"wiki/"+sid);
Document xml=this.loadURL(url);
Element body=this.getContentNode(xml);
DataBroker db=new DataBroker("gannuNLP.dictionaries.Wiki",this.language);
db.setPath(this.path);
db.load("Glosses");
out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
out.write("<contextfile concordance=\""+this.name.replace("->", ".")+"\">\n");
out.write("\t<context filename=\""+sid.replace("&","&")+"\" paras=\"yes\">\n");
int p=1;
int s=1;
String paragraph="";
String sentence="";
paragraph+="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
sentence+="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
ArrayList<Content> stack=new ArrayList<Content>();
stack.addAll(body.getContent());
while(stack.size()>0)
{
Content c=stack.get(0);
stack.remove(0);
if(c.getCType().equals(CType.Text))//actual text
{
//a dot creates a new sentence after processing
String line=c.getValue().trim();
while(!line.equals(""))
{
int idx=line.indexOf(" ");
String words;
if(idx>=0)
words=line.substring(0,idx);
else
words=line;
line=line.substring(words.length()).trim();
String punct=words.replaceAll("\\p{Punct}","\uFFFD");
int index=0;
while(!punct.equals(""))
{
idx=punct.indexOf("\uFFFD");
String word;
if(idx>=0)
word=punct.substring(0,idx);
else
word=punct;
if(word.equals(""))
{
//first the punctuation then the word
//add a punc node
if(words.charAt(index)=='<')
{
sentence+="\t\t\t\t<punc><</punc>\n";
}
else
{
if(words.charAt(index)=='>')
sentence+="\t\t\t\t<punc>></punc>\n";
else
sentence+="\t\t\t\t<punc>"+words.charAt(index)+"</punc>\n";
}
if(words.charAt(index)=='.')
{
sentence+=("\t\t\t</s>\n");
if(sentence.contains("wf"))
{
System.out.print(".");
s++;
paragraph+=sentence;
}
sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
}
index++;
punct=punct.substring(1);
}
else
{
index+=word.length();
sentence+="\t\t\t\t<wf cmd=\"tag\" pos=\"\" lemma=\""+word+"\" wnsn=\"0\" lexsn=\"NA\">";
sentence+=word;
sentence+="</wf>\n";
punct=punct.substring(word.length());
}
}
}
}
if(c.getCType().equals(CType.Element))//other HTML elements (such as <a> or <table>): extract the text inside them
{
Element current=(Element)c;
//tr creates a new sentence after processing
String href=current.getAttributeValue("href");
String aux="navbox";
if(aux.equals(current.getAttributeValue("class")))
break;
if(href!=null&&current.getName().equals("a")&&!this.isNotAnArticle(href)&&!href.contains("Category:"))
{
if(!href.contains("%25"))
{
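//Manually decode percent-escaped UTF-8 bytes in the href (e.g. "%C3%B1" -> "ñ");
//hrefs containing "%25" (an escaped literal '%') are left untouched.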
while(href.contains("%"))
{
int index=href.indexOf("%");
String first=href.substring(0,index);
if(index>href.length())
index=href.length();
String last=href.substring(index+3);
String hex="0x"+href.substring(index+1,index+3);
byte b[];
if(last.startsWith("%"))
{
b=new byte[2];
b[0]=(byte)Integer.decode(hex).intValue();
b[1]=(byte)Integer.decode("0x"+last.substring(1,3)).intValue();
last=last.substring(3);
}
else
{
b=new byte[1];
b[0]=(byte)Integer.decode(hex).intValue();
}
href=first+new String(b,"UTF-8")+last;
}
}
//Lematize the wiki word
String word=current.getValue();
String lemma=word;
Lemma l=db.getLemma(word);
href=href.substring(href.indexOf("wiki/")+5);
boolean ac=true;
if(l==null)
{
l=db.getLemma(href);
if(l!=null)
lemma=l.getLemma();
ac=false;
}
if(l!=null)
{
int i=0;
boolean ban=false;
for(Sense sense:l.getSenses())
{
i++;
if(sense.getSid().equals(href))
{
ban=true;
break;
}
}
String wnsn="";
if(ban)
{
wnsn=String.valueOf(i);
}
else
{
if(ac)
{
l=db.getLemma(href);
if(l!=null)
{
i=0;
ban=false;
for(Sense sense:l.getSenses())
{
i++;
if(sense.getSid().equals(href))
{
ban=true;
break;
}
}
if(ban)
wnsn=String.valueOf(i);
}
}
}
if(wnsn.equals("")&&l!=null)
{
Sense sense=this.getSense(href);
ban=false;
i=0;
for(Sense sx:l.getSenses())
{
i++;
if(sense.itContainsTheSameSamples(sx))
{
ban=true;
break;
}
}
}
if(ban)
wnsn=String.valueOf(i);
if(wnsn.equals(""))
{ stack.addAll(0,current.getContent());
out.write("\t\t\t\t<!--Mismatch link for "+href.replace("&","&")+" -->\n");
}
else
{
sentence+="\t\t\t\t<wf cmd=\"done\" pos=\"\" lemma=\""+Dictionary.normalizeLemmaforFile(lemma)
+"\" wnsn=\""+wnsn
+"\" lexsn=\""+Dictionary.normalizeLemmaforFile(l.getSenses().get(i-1).getSid())+"\">";
sentence+=word;
sentence+="</wf>\n";
}
}
else
{
stack.addAll(0,current.getContent());
}
}
else
{
if(current.getName().equals("tr")||current.getName().equals("p"))
{
sentence+=("\t\t\t</s>\n");
if(sentence.contains("wf"))
{
System.out.print(".");
s++;
paragraph+=sentence;
}
if(paragraph.contains("wf"))
{
System.out.println("Saving paragraph "+String.valueOf(p));
p++;
paragraph+=("\t\t</p>\n");
out.write(paragraph.replace("&","&amp;"));
}
s=1;
sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
paragraph="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
}
stack.addAll(0,current.getContent());
}
}
}
sentence+=("\t\t\t</s>\n");
if(sentence.contains("wf"))
{
System.out.print(".");
s++;
paragraph+=sentence;
}
if(paragraph.contains("wf"))
{
System.out.println("Saving paragraph "+String.valueOf(p));
p++;
paragraph+=("\t\t</p>\n");
out.write(paragraph.replace("&","&amp;"));
}
out.write("\t</context>\n");
out.write("</contextfile>\n");
out.close();
fout.close();
}
/**
* Returns the search hits of two simultaneous senses.
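* <p>A minimal sketch for a co-occurrence query, assuming a loaded connector:</p>
* <pre>
* double both = wiki.getCounts(senseA, senseB);
* </pre>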
* @param sense Target sense.
* @param sense2 The other target sense.
* @return The search hits of pages containing both senses.
* @throws Exception
*/
public double getCounts(Sense sense, Sense sense2) throws Exception{
double w=0.0;
String qry="\""+sense.getSid()+"\"";
if(!sense.equals(sense2))
{
qry+="+AND+"+"\""+sense2.getSid()+"\"";
}
URL count=new URL(this.path+"/w/index.php?title=Special%3ASearch&profile=default&search="+qry+"&fulltext=Search");
Document xml=this.loadURL(count);
if(xml!=null)
{
String aux="mw-search-formheader";
for(Element e:this.getContentNode(xml).getDescendants(new ElementFilter("div")))
{
if(aux.equals(e.getAttributeValue("class")))
{
for(Element x:e.getDescendants(new ElementFilter("div")))
{
aux="results-info";
if(aux.equals(x.getAttributeValue("class")))
{
aux=e.getValue().substring(e.getValue().indexOf(this.countText)+this.countText.length());
aux=aux.substring(aux.indexOf(this.countPrepIn)+this.countPrepIn.length());
aux=aux.substring(0,aux.indexOf(this.countPrepOut));
aux=aux.replaceAll("\\D", "");
w=Double.parseDouble(aux);
break;
}
}
break;
}
}
}
else
{
String aux=this.loadURLAsText(count);
aux=aux.substring(aux.indexOf("mw-search-formheader"));
aux=aux.substring(aux.indexOf("results-info"));
aux=aux.substring(aux.indexOf(this.countPrepIn)+this.countPrepIn.length());
aux=aux.substring(0,aux.indexOf(this.countPrepOut));
aux=aux.replaceAll("\\D", "");
w=Double.parseDouble(aux);
}
return w;
}
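/**
 * Tells whether a lemma has at least one corresponding Wikipedia article.
 * Note that this performs a full getLemma crawl; unresolvable lemmas are cached in badSearches.
 */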
@Override
public boolean doesLemmaExists(String lemma) throws Exception {
return this.getLemma(lemma)!=null;
}
}