f.mkdirs();
FileWriter fout=new FileWriter(path+sid+".sgf");
BufferedWriter out=new BufferedWriter(fout);
URL url=new URL(this.path+"wiki/"+sid);
Document xml=this.loadURL(url);
Element body=this.getContentNode(xml);
DataBroker db=new DataBroker("gannuNLP.dictionaries.Wiki",this.language);
db.setPath(this.path);
db.load("Glosses");
out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
out.write("<contextfile concordance=\""+this.name.replace("->", ".")+"\">\n");
out.write("\t<context filename=\""+sid.replace("&","&")+"\" paras=\"yes\">\n");
int p=1;
int s=1;
String paragraph="";
String sentence="";
paragraph+="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
sentence+="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
ArrayList<Content> stack=new ArrayList<Content>();
stack.addAll(body.getContent());
while(stack.size()>0)
{
Content c=stack.get(0);
stack.remove(0);
if(c.getCType().equals(CType.Text))//actual text
{
//a dot creates a new sentence after processing
String line=c.getValue().trim();
while(!line.equals(""))
{
int idx=line.indexOf(" ");
String words;
if(idx>=0)
words=line.substring(0,idx);
else
words=line;
line=line.substring(words.length()).trim();
String punct=words.replaceAll("\\p{Punct}","�");
int index=0;
while(!punct.equals(""))
{
idx=punct.indexOf("�");
String word;
if(idx>=0)
word=punct.substring(0,idx);
else
word=punct;
if(word.equals(""))
{
//first the punctuation then the word
//add a punc node
if(words.charAt(index)=='<')
{
sentence+="\t\t\t\t<punc><</punc>\n";
}
else
{
if(words.charAt(index)=='>')
sentence+="\t\t\t\t<punc>></punc>\n";
else
sentence+="\t\t\t\t<punc>"+words.charAt(index)+"</punc>\n";
}
if(words.charAt(index)=='.')
{
sentence+=("\t\t\t</s>\n");
if(sentence.contains("wf"))
{
System.out.print(".");
s++;
paragraph+=sentence;
}
sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
}
index++;
punct=punct.substring(1);
}
else
{
index+=word.length();
sentence+="\t\t\t\t<wf cmd=\"tag\" pos=\"\" lemma=\""+word+"\" wnsn=\"0\" lexsn=\"NA\">";
sentence+=word;
sentence+="</wf>\n";
punct=punct.substring(word.length());
}
}
}
}
if(c.getCType().equals(CType.Element))//other html elements such a or table should extract the text inside these elements
{
Element current=(Element)c;
//tr creates a new sentence after processing
String href=current.getAttributeValue("href");
String aux="navbox";
if(aux.equals(current.getAttributeValue("class")))
break;
if(href!=null&¤t.getName().equals("a")&&!this.isNotAnArticle(href)&&!href.contains("Category:"))
{
if(!href.contains("%25"))
{
while(href.contains("%"))
{
int index=href.indexOf("%");
String first=href.substring(0,index);
if(index>href.length())
index=href.length();
String last=href.substring(index+3);
String hex="0x"+href.substring(index+1,index+3);
byte b[];
if(last.startsWith("%"))
{
b=new byte[2];
b[0]=(byte)Integer.decode(hex).intValue();
b[1]=(byte)Integer.decode("0x"+last.substring(1,3)).intValue();
last=last.substring(3);
}
else
{
b=new byte[1];
b[0]=(byte)Integer.decode(hex).intValue();
}
href=first+new String(b,"UTF-8")+last;
}
}
//Lemmatize the wiki word
String word=current.getValue();
String lemma=word;
Lemma l=db.getLemma(word);
href=href.substring(href.indexOf("wiki/")+5);
boolean ac=true;
if(l==null)
{
l=db.getLemma(href);
if(l!=null)
lemma=l.getLemma();
ac=false;
}
if(l!=null)
{
int i=0;
boolean ban=false;
for(Sense sense:l.getSenses())
{
i++;
if(sense.getSid().equals(href))
{
ban=true;
break;
}
}
String wnsn="";
if(ban)
{
wnsn=String.valueOf(i);
}
else
{
if(ac)
{
l=db.getLemma(href);
if(l!=null)
{
i=0;
ban=false;
for(Sense sense:l.getSenses())
{
i++;
if(sense.getSid().equals(href))
{
ban=true;
break;
}
}
if(ban)
wnsn=String.valueOf(i);
}
}
}
if(wnsn.equals("")&&l!=null)
{
Sense sense=this.getSense(href);
ban=false;
i=0;
for(Sense sx:l.getSenses())
{
i++;
if(sense.itContainsTheSameSamples(sx))
{
ban=true;
break;
}
}
}
if(ban)
wnsn=String.valueOf(i);
if(wnsn.equals(""))
{ stack.addAll(0,current.getContent());
out.write("\t\t\t\t<!--Mismatch link for "+href.replace("&","&")+" -->\n");
}
else
{
sentence+="\t\t\t\t<wf cmd=\"done\" pos=\"\" lemma=\""+Dictionary.normalizeLemmaforFile(lemma)
+"\" wnsn=\""+wnsn
+"\" lexsn=\""+Dictionary.normalizeLemmaforFile(l.getSenses().get(i-1).getSid())+"\">";
sentence+=word;
sentence+="</wf>\n";
}
}
else
{
stack.addAll(0,current.getContent());
}
}
else
{
if(current.getName().equals("tr")||current.getName().equals("p"))
{
sentence+=("\t\t\t</s>\n");
if(sentence.contains("wf"))
{
System.out.print(".");
s++;
paragraph+=sentence;
}
if(paragraph.contains("wf"))
{
System.out.println("Saving paragraph "+String.valueOf(p));
p++;
paragraph+=("\t\t</p>\n");
out.write(paragraph.replace("&","&"));
}
s=1;
sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
paragraph="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
}
stack.addAll(0,current.getContent());
}
}
}
sentence+=("\t\t\t</s>\n");
if(sentence.contains("wf"))