package gannuNLP.data;
import gannuNLP.dictionaries.DataBroker;
import gannuNLP.dictionaries.Dictionary;
import gannuWSD.sensefilters.SenseFilter;
import gannuWSD.testing.Decision;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
* Class for loading an SGF file.
* @author Francisco Viveros-Jiménez
*
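* <p>A minimal usage sketch (hypothetical setup; assumes {@code MyDictionary} is a
* configured {@link Dictionary} implementation and that the sample file exists):</p>
* <pre>{@code
* Dictionary dict = new MyDictionary();
* Input doc = new Input(new File("corpus/br-a01.sgf"), null, dict, false, true);
* for (AmbiguousWord word : doc.getAmbiguousWords())
*     System.out.println(word.getLemma() + " -> " + doc.getTFIDF(word.getLemma()));
* }</pre>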
*/
public class Input implements Serializable{
/**
*
*/
private static final long serialVersionUID = 1L;
/**
* An ArrayList containing the document organized into paragraphs, sentences, and words.
*/
ArrayList<ArrayList<ArrayList<AmbiguousWord>>> contain;
/**
* An ArrayList containing this document's sentences as plain text.
*/
ArrayList<String> textSentences;
/**
* An ArrayList with the AmbiguousWords of this document.
*/
ArrayList<AmbiguousWord> ambiguousWords;
/**
* A simple index indicating the positions (appearances) of each lemma in the document.
*/
HashMap<String, ArrayList<Integer>> Index;
/**
* This file's name.
*/
String name;
/**
* Returns an ArrayList with all the words of this document that are useful for WSD.
* @return ambiguousWords
*/
public ArrayList<AmbiguousWord> getAmbiguousWords() {
return ambiguousWords;
}
/**
* Returns a HashMap indicating the positions (appearances) of each lemma in the document.
* @return Index
*/
public HashMap<String, ArrayList<Integer>> getIndex() {
return Index;
}
/**
* Parses an SGF text for RAW tagging.
* @param targetText Target SGF text.
* @param dict Source dictionary.
* @throws Exception
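* <p>A minimal sketch (assumes {@code broker} is a configured {@link DataBroker} and
* {@code dict} a configured {@link Dictionary} implementation):</p>
* <pre>{@code
* String sgf = Input.generateSGFFromRaw(new File("document.txt"), broker);
* Input doc = new Input(sgf, dict);
* System.out.println(doc.getAmbiguousWords().size() + " target words loaded");
* }</pre>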
*/
public Input(String targetText, Dictionary dict)throws Exception
{
int running=0;
int target=0;
DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = fact.newDocumentBuilder();
StringReader fin=new StringReader(targetText);
BufferedReader in=new BufferedReader(fin);
String line;
String body="";
while((line=in.readLine())!=null)
{
body+=line+"\n";
}
in.close();
fin.close();
Document test=builder.parse(new ByteArrayInputStream(body.getBytes("UTF-8")));
this.name="tmp";
NodeList ws=test.getElementsByTagName("wf");
this.ambiguousWords=new ArrayList<AmbiguousWord>(ws.getLength());
int index=0;
NodeList paragraphs=test.getElementsByTagName("p");
this.textSentences=new ArrayList<String>();
this.contain=new ArrayList<ArrayList<ArrayList<AmbiguousWord>>>();
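//Iterate over the paragraphs; documents without <p> tags are treated as a single paragraph.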
for(int p=0;p<paragraphs.getLength()||(paragraphs.getLength()==0&&p==0);p++)
{
NodeList sn;
if(paragraphs.getLength()==0)
{
sn=test.getElementsByTagName("s");
}
else
{
sn=((Element)paragraphs.item(p)).getElementsByTagName("s");
}
ArrayList<ArrayList<AmbiguousWord>> sentences=new ArrayList<ArrayList<AmbiguousWord>>();
for(int s=0;s<sn.getLength();s++)
{
ArrayList<AmbiguousWord> words=new ArrayList<AmbiguousWord>();
Element sentence=((Element)sn.item(s));
NodeList wds=sentence.getElementsByTagName("wf");
String text="";
for(int w=0;w<wds.getLength();w++)
{
System.out.print(".");
Element word=(Element)wds.item(w);
AmbiguousWord nword=null;
String pos=word.getAttribute("pos");
if(!pos.equals(""))
pos=pos.substring(0,1).toUpperCase();
if(pos.equals("W"))
pos="R";
if(word.getAttribute("cmd").equals("tag"))
{
running++;
target++;
if(pos.equals("N")||pos.equals("V")||pos.equals("R")||pos.equals("J")||pos.equals(""))
{
if(pos.equals("")||pos.equals("NA"))
nword=new AmbiguousWord(word.getAttribute("lemma"),word.getAttribute("wnsn"),index,dict);
else
nword=new AmbiguousWord(word.getAttribute("lemma")+"_"+pos,word.getAttribute("wnsn"),index,dict);
this.ambiguousWords.add(nword);
index++;
words.add(nword);
}
}
if(word.getTextContent()!=null)
text+=word.getTextContent()+" ";
}
if(words.size()>0)
{
words.trimToSize();
this.textSentences.add(text.trim());
sentences.add(words);
}
}
sentences.trimToSize();
this.contain.add(sentences);
}
this.ambiguousWords.trimToSize();
this.contain.trimToSize();
this.Indexing();
this.calculateTF();
System.out.println();
System.out.println("Running words: "+String.valueOf(running));
System.out.println("Target words: "+String.valueOf(target));
this.load();
System.out.println();
}
/**
* Parses an SGF file.
* @param file Target file.
* @param filter SenseFilter for excluding some senses from the words.
* @param dict Source dictionary.
* @param noTag Set it true to include words that were not manually tagged, such as open-class words or untagged words.
* Set it false to exclude these words.
* @param fullLoad Set it true to load all the LemmaObjects from the dictionary.
* Set it false to load AmbiguousWords with null LemmaObjects, saving memory.
* @throws Exception
*/
public Input(File file, SenseFilter filter, Dictionary dict,boolean noTag, boolean fullLoad)throws Exception
{
int running=0;
int target=0;
DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = fact.newDocumentBuilder();
FileReader fin=new FileReader(file);
BufferedReader in=new BufferedReader(fin);
String line;
String body="";
while((line=in.readLine())!=null)
{
body+=line+"\n";
}
in.close();
fin.close();
Document test=builder.parse(new ByteArrayInputStream(body.getBytes("UTF-8")));
this.name=file.getName();
NodeList ws=test.getElementsByTagName("wf");
this.ambiguousWords=new ArrayList<AmbiguousWord>(ws.getLength());
int index=0;
NodeList paragraphs=test.getElementsByTagName("p");
this.textSentences=new ArrayList<String>();
this.contain=new ArrayList<ArrayList<ArrayList<AmbiguousWord>>>();
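//Iterate over the paragraphs; documents without <p> tags are treated as a single paragraph.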
for(int p=0;p<paragraphs.getLength()||(paragraphs.getLength()==0&&p==0);p++)
{
NodeList sn;
if(paragraphs.getLength()==0)
{
sn=test.getElementsByTagName("s");
}
else
{
sn=((Element)paragraphs.item(p)).getElementsByTagName("s");
}
ArrayList<ArrayList<AmbiguousWord>> sentences=new ArrayList<ArrayList<AmbiguousWord>>();
for(int s=0;s<sn.getLength();s++)
{
ArrayList<AmbiguousWord> words=new ArrayList<AmbiguousWord>();
Element sentence=((Element)sn.item(s));
NodeList wds=sentence.getElementsByTagName("wf");
String text="";
for(int w=0;w<wds.getLength();w++)
{
System.out.print(".");
Element word=(Element)wds.item(w);
AmbiguousWord nword=null;
String pos=word.getAttribute("pos");
if(!pos.equals(""))
pos=pos.substring(0,1).toUpperCase();
if(pos.equals("W"))
pos="R";
if(word.getAttribute("cmd").equals("done"))
{
running++;
target++;
if(pos.equals("N")||pos.equals("V")||pos.equals("R")||pos.equals("J")||pos.equals(""))
{
if(pos.equals("")||pos.equals("NA"))
nword=new AmbiguousWord(word.getAttribute("lemma"),word.getAttribute("wnsn"),index,dict);
else
nword=new AmbiguousWord(word.getAttribute("lemma")+"_"+pos,word.getAttribute("wnsn"),index,dict);
if(filter!=null)
filter.prune(nword);
this.ambiguousWords.add(nword);
index++;
words.add(nword);
}
}
if(noTag&&nword==null)
{
running++;
String value;
if(word.getAttribute("lemma")!=null&&!word.getAttribute("lemma").equals(""))
value=word.getAttribute("lemma");
else
{
if(word.getAttribute("pos").equals(""))
value=word.getTextContent().toLowerCase();
else
value=word.getTextContent().toLowerCase()+"_"+word.getAttribute("pos").substring(0,1);
}
nword=new AmbiguousWord(value,index,dict);
this.ambiguousWords.add(nword);
index++;
words.add(nword);
}
if(word.getTextContent()!=null)
text+=word.getTextContent()+" ";
}
if(words.size()>0)
{
words.trimToSize();
this.textSentences.add(text.trim());
sentences.add(words);
}
}
sentences.trimToSize();
this.contain.add(sentences);
}
this.ambiguousWords.trimToSize();
this.contain.trimToSize();
this.Indexing();
this.calculateTF();
System.out.println();
System.out.println("Running words: "+String.valueOf(running));
System.out.println("Target words: "+String.valueOf(target));
if(fullLoad)
{
this.load();
System.out.println();
}
}
/**
* Loads all the corresponding LemmaObjects from the base dictionary.
* Each distinct lemma is loaded once and its Lemma object is shared among all of its
* occurrences; the average polysemy is printed when loading finishes.
* @throws Exception
*/
public void load() throws Exception {
//Load each distinct lemma once and share its Lemma object across all of its appearances
double poly=0.0;
int count=0;
System.out.println("Loading lemmas from dictionary please wait");
for(Entry<String,ArrayList<Integer>> entry:this.Index.entrySet())
{
System.out.print(".");
count++;
AmbiguousWord word=this.ambiguousWords.get(entry.getValue().get(0).intValue());
Lemma l=word.getLemmaObject();
if(l!=null)
{
poly+=((double)word.getSenses().size());
for(int i=1;i<entry.getValue().size();i++)
{
count++;
word=this.ambiguousWords.get(entry.getValue().get(i).intValue());
word.setLemmaObject(l);
poly+=((double)word.getSenses().size());
}
}
else
{
System.out.print("!");
}
}
System.out.println("\nPolysemy: "+String.valueOf(poly/((double)count)));
}
/**
* Calculates the TF value for every ambiguous word in this document:
* TF(lemma) = number of appearances of the lemma / total number of words in the document.
*/
private void calculateTF() {
for(AmbiguousWord word:this.ambiguousWords)
{
if(word.getCorrectSenseNumbers().length>0)
{
ArrayList<Integer> aux=this.Index.get(word.getLemma());
word.setTf((double)aux.size()/(double)this.ambiguousWords.size());
}
else
{
double tf=0.0;
for(AmbiguousWord tword:this.ambiguousWords)
{
if(word.getLemma().equals(tword.getLemma()))
{
tf+=1.0;
}
}
word.setTf(tf/(double)this.ambiguousWords.size());
}
}
}
/**
* Builds the index of lemma appearances (lemma -> positions in ambiguousWords).
* Only words having at least one correct sense tag are indexed.
*/
private void Indexing()
{
Index=new HashMap<String,ArrayList<Integer>>();
int i=0;
for(AmbiguousWord word:this.ambiguousWords)
{
if(word.getCorrectSenseNumbers().length>0)
{
if(Index.containsKey(word.getLemma()))
{
Index.get(word.getLemma()).add(Integer.valueOf(i));
}
else
{
ArrayList<Integer> aux=new ArrayList<Integer>(5);
aux.add(Integer.valueOf(i));
Index.put(word.getLemma(), aux);
}
}
i++;
}
}
/**
* Returns the name of this file.
*/
public String toString()
{
return name;
}
/**
*
* @return All the sentences in this document
*/
public ArrayList<ArrayList<AmbiguousWord>> getSentences() {
ArrayList<ArrayList<AmbiguousWord>> sentences=new ArrayList<ArrayList<AmbiguousWord>>();
for(ArrayList<ArrayList<AmbiguousWord>> ss:this.contain)
{
sentences.addAll(ss);
}
return sentences;
}
/**
*
* @return this.textSentences
*/
public ArrayList<String> getTextSentences()
{
return this.textSentences;
}
/**
* Returns the TF-IDF value for a word, computed as TF * IDF, or 0.0 when the word is not in the index.
* @param word Target word.
* @return The TF-IDF value.
*/
public double getTFIDF(String word) {
ArrayList<Integer> aux=this.Index.get(word);
if(aux!=null)
return this.ambiguousWords.get(aux.get(0)).getTf()*this.ambiguousWords.get(aux.get(0)).getIDF();
return 0.0;
}
/**
* Generates an SGF file from plain text and saves it next to the source file with an added ".sgf" extension.
* @param target Target file containing the plain text.
* @param data Base DataBroker instance used for lemmatization.
* @return The generated SGF markup.
* @throws Exception
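* <p>A minimal sketch (assumes {@code broker} is a configured {@link DataBroker}):</p>
* <pre>{@code
* String sgf = Input.generateSGFFromRaw(new File("document.txt"), broker);
* //document.txt.sgf is now saved next to document.txt
* }</pre>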
*/
public static String generateSGFFromRaw(File target, DataBroker data)throws Exception {
FileReader fin=new FileReader(target);
BufferedReader in=new BufferedReader(fin);
String ftext="";
ArrayList<String> text=new ArrayList<String>();
String line;
while((line=in.readLine())!=null)
{
text.add(line);
}
ArrayList<ArrayList<ArrayList<String>>> chunks=new ArrayList<ArrayList<ArrayList<String>>>();
ArrayList<ArrayList<ArrayList<String>>> lemmas=data.lemmatize(text,chunks);
chunks.trimToSize();
FileWriter fout=new FileWriter(target.getCanonicalPath()+".sgf");
BufferedWriter out=new BufferedWriter(fout);
ftext+="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
ftext+="<contextfile concordance=\"raw\">\n";
ftext+="\t<context filename=\""+target.getName()+"\" paras=\"yes\">\n";
int p=1;
int s;
String pos;
for(ArrayList<ArrayList<String>> paragraph:lemmas)
{
ftext+="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
s=1;
ArrayList<ArrayList<String>> oparagraph=chunks.get(p-1);
for(ArrayList<String> sentence:paragraph)
{
ArrayList<String> osentence=oparagraph.get(s-1);
ftext+="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
int w=0;
for(String word:sentence)
{
pos=word.substring(word.length()-1);
String oword=osentence.get(w);
if(Dictionary.isPunctuation(pos))
{
ftext+="\t\t\t\t<punc>"+oword+"</punc>\n";
}
else
{
if(Dictionary.getPOS(pos)>=0)
{
ftext+="\t\t\t\t<wf cmd=\"tag\" pos=\""+pos+"\" lemma=\""+word.substring(0,word.length()-2)+"\" wnsn=\"0\">"+oword+"</wf>\n";
}
else
{
ftext+="\t\t\t\t<wf cmd=\"ignore\" pos=\""+pos+"\">"+oword+"</wf>\n";
}
}
w++;
}
ftext+="\t\t\t</s>\n";
s++;
}
ftext+="\t\t</p>\n";
p++;
}
ftext+="\t</context>\n";
ftext+="</contextfile>\n";
out.write(ftext);
out.close();
fout.close();
in.close();
fin.close();
return ftext;
}
/**
* Modifies a target SGF file by replacing the wnsn attribute values with the answers generated by a WSD algorithm.
* @param file Target file name.
* @param ds Decisions used for rewriting the wnsn attributes.
* @throws Exception
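* <p>A minimal sketch (assumes {@code decisions} holds one {@link Decision} per target
* word, in document order, as produced by a WSD algorithm):</p>
* <pre>{@code
* Input.modifyAndSaveSGF("corpus/br-a01.sgf", decisions);
* }</pre>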
*/
public static void modifyAndSaveSGF(String file, ArrayList<Decision> ds) throws Exception{
int index=0;
int idx;
String line;
String text="";
FileReader fin=new FileReader(file);
BufferedReader in=new BufferedReader(fin);
while((line=in.readLine())!=null)
{
idx=line.indexOf("cmd=\"tag");
if(idx>=0)
{
idx=line.indexOf("wnsn=\"");
text+=line.substring(0,idx);
text+="wnsn=\"";
Decision d=ds.get(index);
if(d.isAttempted())
{
int wnsns[]=d.getAnswers();
if(wnsns.length>0)
{
text+=String.valueOf(wnsns[0]+1);
for(int i=1;i<wnsns.length;i++)
{
text+=";"+String.valueOf(wnsns[i]+1);
}
}
else
{
text+="0";
}
}
else
{
text+="0";
}
index++;
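//Resume after the original wnsn value: idx+7 skips "wnsn=\"" plus one character,
//so the original value is assumed to be a single character (e.g. wnsn="0").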
text+=line.substring(idx+7);
}
else
{
text+=line;
}
text+="\n";
}
fin.close();
in.close();
FileWriter fout=new FileWriter(file);
BufferedWriter out=new BufferedWriter(fout);
out.write(text);
out.close();
fout.close();
}
}