sentence+="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
ArrayList<Content> stack=new ArrayList<Content>();
stack.addAll(body.getContent());
while(stack.size()>0)
{
Content c=stack.get(0);
stack.remove(0);
// Text node: split its value into space-delimited chunks, then split every
// chunk into words (<wf>) and single punctuation characters (<punc>), all
// appended to `sentence`. A '.' closes the current <s> element.
if(c.getCType().equals(CType.Text))//actual text
{
    String line=c.getValue().trim();
    while(!line.equals(""))
    {
        // Next chunk = everything up to the first ' '. Only the space character
        // splits chunks here; any other whitespace inside the text stays in the chunk.
        int idx=line.indexOf(" ");
        String words;
        if(idx>=0)
            words=line.substring(0,idx);
        else
            words=line;
        line=line.substring(words.length()).trim();
        // Every punctuation character is replaced by a one-character marker, so a
        // position in `punct` is the same position in `words`; `index` tracks it.
        // NOTE(review): the marker literal looks like a mis-encoded character; the
        // two occurrences below must stay identical for the scan to work.
        String punct=words.replaceAll("\\p{Punct}","�");
        int index=0;
        while(!punct.equals(""))
        {
            idx=punct.indexOf("�");
            String word;
            if(idx>=0)
                word=punct.substring(0,idx);
            else
                word=punct;
            if(word.equals(""))
            {
                // `punct` starts with a marker: the original character at this
                // position is punctuation and gets its own <punc> node.
                char mark=words.charAt(index);
                // Markup-significant characters must be written as entities,
                // otherwise the node (e.g. <punc><</punc>) is malformed.
                String escaped;
                switch(mark)
                {
                    case '<':
                        escaped="&lt;";
                        break;
                    case '>':
                        escaped="&gt;";
                        break;
                    case '&':
                        escaped="&amp;";
                        break;
                    default:
                        escaped=String.valueOf(mark);
                        break;
                }
                sentence+="\t\t\t\t<punc>"+escaped+"</punc>\n";
                if(mark=='.')
                {
                    // A dot ends the sentence. It is kept (and the sentence counter
                    // advanced) only if it contains at least one <wf> entry;
                    // otherwise it is dropped and the same snum is reused.
                    sentence+=("\t\t\t</s>\n");
                    if(sentence.contains("wf"))
                    {
                        System.out.print(".");
                        s++;
                        paragraph+=sentence;
                    }
                    sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
                }
                index++;
                punct=punct.substring(1);
            }
            else
            {
                // A run of non-punctuation characters: emit it as a word form.
                // `word` contains no \p{Punct} characters, so it is safe to place
                // inside the quoted lemma attribute without escaping.
                index+=word.length();
                sentence+="\t\t\t\t<wf cmd=\"tag\" pos=\"\" lemma=\""+word+"\" wnsn=\"0\" lexsn=\"NA\">";
                sentence+=word;
                sentence+="</wf>\n";
                punct=punct.substring(word.length());
            }
        }
    }
}
if(c.getCType().equals(CType.Element))//other html elements such a or table should extract the text inside these elements
{
Element current=(Element)c;
//tr creates a new sentence after processing
String href=current.getAttributeValue("href");