package gannuWSD;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.util.ArrayList;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.filter.ElementFilter;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import gannuNLP.data.Lemma;
import gannuNLP.data.Sense;
import gannuNLP.dictionaries.DataBroker;
import gannuUtil.Util;
/**
* Command for cleaning SemCor files written in previous XML formats.
* Usage:<br/>
* java -cp "gannu.jar" gannuWSD.SemCorCleaner dictionaryClass Version target [tag] [rewrite] [fix]<br/>
* You can add the following flags:<br/>
* <ul>
* <li>tag: allows you to continue marking on the gold standard.
* Words with cmd="tag" in the original file are open-class words which were not disambiguated manually. </li>
* <li>rewrite: overwrites previously generated .sgf files.</li>
* <li>fix: turns autofix feature on. If not you have to correct any warning by yourself. Some sample
* warnings are: bad correct sense number, lemma unavailable in dictionary, etc.</li>
* </ul>
* @author Francisco Viveros-Jiménez
*
*/
public class SemCorCleaner {
    /**
     * Flag for the autofix feature.
     */
    public static boolean fix;

    /**
     * POS letter used when a word carries no usable pos attribute.
     */
    private static final String UNKNOWN_POS = "X";

    /**
     * Total number of context words printed around the current word (half before, the rest after).
     */
    private static final int WINDOW_SIZE = 14;

    /**
     * Reader shared by every interactive prompt. A single instance is kept because each
     * BufferedReader reads ahead: creating a new one per prompt silently drops input that
     * a previous instance already buffered (noticeable when stdin is piped).
     */
    private static BufferedReader keyboard;

    /**
     * Reads one trimmed line typed by the user.
     * @return The line without leading/trailing whitespace.
     * @throws Exception IllegalStateException when standard input is exhausted, or any I/O error.
     */
    private static String readUserLine() throws Exception
    {
        if (keyboard == null)
        {
            keyboard = new BufferedReader(new InputStreamReader(System.in));
        }
        String line = keyboard.readLine();
        if (line == null)
        {
            // Without this check an exhausted stdin surfaced as a NullPointerException (or an endless prompt).
            throw new IllegalStateException("Standard input was closed while waiting for user input");
        }
        return line.trim();
    }

    /**
     * Flags a word with cmd="tag" so it can be disambiguated later.
     * @param word Target word in its XML node format.
     */
    private static void markForTagging(Element word)
    {
        System.out.println("Marking "+word.getValue()+" for future tagging ");
        word.setAttribute("cmd","tag");
    }

    /**
     * Copies the lemma and POS of a dictionary entry into the word and resets its sense number.
     * @param word Target word in its XML node format.
     * @param l Dictionary entry; its lemma is assumed to end with a two character "_P" suffix.
     */
    private static void applyLemma(Element word, Lemma l)
    {
        String full = l.getLemma();
        word.setAttribute("lemma", full.substring(0, full.length()-2));
        word.setAttribute("pos", l.getPos());
        // -1 is never a valid sense number, so the sense check that follows will ask for one.
        word.setAttribute("wnsn","-1");
    }

    /**
     * Returns the first letter of a pos attribute value.
     * @param pos Raw pos attribute value, possibly null or empty.
     * @return The first character of pos, or "X" when there is none.
     */
    private static String posInitial(String pos)
    {
        if (pos == null || pos.length() == 0)
        {
            return UNKNOWN_POS;
        }
        return pos.substring(0, 1);
    }

    /**
     * Method that prompts the user for completing missing data of a word.
     * Single sense lemmas are fixed automatically. Words that cannot be fixed
     * (autofix enabled, or a lemma without senses) are marked with cmd="tag".
     * @param word Target word in its XML node format.
     * @param l Corresponding lemma.
     * @throws Exception
     */
    public static void checkSenses(Element word,Lemma l)throws Exception
    {
        int polysemy = l.getSenses().size();
        String wnsn = word.getAttributeValue("wnsn");
        while (!SemCorCleaner.isValidWNSN(wnsn, polysemy))
        {
            System.out.println("\nBad wnsn attribute for <"+word.getValue()+"> please select one");
            if (polysemy > 1)
            {
                if (SemCorCleaner.fix)
                {
                    markForTagging(word);
                    break;
                }
                int number = 1;
                for (Sense s : l.getSenses())
                {
                    System.out.println(String.valueOf(number)+":"+s.getSamples().toString());
                    number++;
                }
                // NOTE(review): isValidWNSN rejects 0, so answering zero simply shows this prompt again.
                System.out.println("Please write the valid sense number(s) separated with ; or pick zero for choosing other sense");
                wnsn = readUserLine();
            }
            else if (polysemy == 1)
            {
                wnsn = "1";
                System.out.println("Auto-fix of single sense lemma");
            }
            else
            {
                // No sense number can ever be valid for a lemma without senses;
                // forcing "1" here used to spin forever.
                markForTagging(word);
                break;
            }
        }
        if (wnsn != null)
        {
            // The attribute may have been absent; JDOM does not accept null attribute values.
            word.setAttribute("wnsn", wnsn);
        }
    }

    /**
     * Displays the words surrounding the current word: up to 7 before it and as many
     * after it as needed to show 14 context words in total. The current word is printed
     * between angle brackets.
     * @param words Words in the document.
     * @param w Index of the current word.
     */
    public static void displayWindow(ArrayList<Element> words,int w)
    {
        int remaining = WINDOW_SIZE;
        int first = Math.max(0, w - WINDOW_SIZE / 2);
        for (int i = first; i < w; i++)
        {
            System.out.print(words.get(i).getValue()+" ");
            remaining--;
        }
        System.out.print("<"+words.get(w).getValue()+"> ");
        // Whatever was not used on the left side is spent on the right side.
        for (int i = w + 1; remaining > 0 && i < words.size(); i++)
        {
            System.out.print(words.get(i).getValue()+" ");
            remaining--;
        }
    }

    /**
     * Method for checking if a lemma exists in the dictionary.
     * Prompts the user when the lemma is unavailable or use the auto-fix if specified.
     * On success the word receives the corrected lemma and pos attributes and wnsn="-1".
     * Otherwise it is marked with cmd="tag" (autofix) or cmd="ignore" (user answered "no").
     * @param dic Base dictionary.
     * @param lemma Target lemma.
     * @param word Original wf XML node.
     * @param pos POS tag.
     * @throws Exception
     */
    public static void readLemma(DataBroker dic,String lemma,Element word,String pos)throws Exception
    {
        String lem = lemma+"_"+pos;
        ArrayList<Sense> senses = dic.getSenses(lem);
        Lemma l = null;
        if (senses.size() == 0)
        {
            if (SemCorCleaner.fix)
            {
                markForTagging(word);
                return;
            }
            System.out.println("Lemma not found: "+lem);
            System.out.println("Please enter a lemma or no for ignoring this word!");
            String answer = readUserLine();
            if (answer.equals("no"))
            {
                word.setAttribute("cmd","ignore");
                return;
            }
            l = dic.getLemma(answer);
            while (l == null)
            {
                System.out.println("Please write the valid form (lemma_P where lemma is in normal form and P is the first letter of the pos tag)");
                l = dic.getLemma(readUserLine());
            }
            applyLemma(word, l);
            return;
        }

        System.out.println("\n<"+word.getValue()+"> may not be in normal form");
        ArrayList<String> morphs = dic.Morphy(lemma, pos);
        if (morphs.size() == 1)
        {
            String candidate = morphs.get(0)+"_"+pos;
            System.out.println(lemma+" was autocorrected to "+ candidate);
            l = dic.getLemma(candidate);
        }
        // Reached with l==null when there is no unique candidate or the candidate is not in the
        // dictionary. The latter case used to retry the same candidate forever.
        while (l == null)
        {
            if (SemCorCleaner.fix)
            {
                markForTagging(word);
                break;
            }
            System.out.println("Please select the valid form (lemma_P where lemma is in normal form and P is the first letter of the pos tag).\nSome suggestions are "+morphs+" just remember to add the _"+pos+" pos tagat the end of the lemma");
            l = dic.getLemma(readUserLine());
        }
        if (l != null)
        {
            applyLemma(word, l);
        }
    }

    /**
     * Tells if a wnsn attribute is valid.
     * A valid value is either "U" or a list of sense numbers separated with ";" where every
     * number lies between 1 and the polysemy of the word.
     * @param wnsn Target wnsn attribute. May be null (missing attribute).
     * @param senses Polysemy of the target word.
     * @return True when having a valid wnsn attribute; false for null, empty or non numeric values.
     */
    public static boolean isValidWNSN(String wnsn,int senses)
    {
        if (wnsn == null)
        {
            return false;
        }
        if (wnsn.equals("U"))
        {
            return true;
        }
        String[] parts = wnsn.split(";");
        if (parts.length == 0)
        {
            // split() yields an empty array for values made only of separators, e.g. ";".
            return false;
        }
        for (String part : parts)
        {
            int s;
            try
            {
                s = Integer.parseInt(part);
            }
            catch (NumberFormatException e)
            {
                return false;
            }
            if (s <= 0 || s > senses)
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Escapes ampersands and surrounds every attribute value of a line with double quotes.
     * A value is the text between "=" and the next "&gt;" or, when the piece has no "&gt;",
     * the next space. When neither is present the rest of the piece is taken as the value.
     * @param line Raw line of the original file.
     * @return The repaired line.
     */
    private static String quoteAttributes(String line)
    {
        // Raw ampersands are not well-formed XML and would make the later parsing step fail.
        String[] tokens = line.replace("&", "&amp;").split("=");
        StringBuilder repaired = new StringBuilder();
        for (int i = 0; i < tokens.length; i++)
        {
            String token = tokens[i];
            if (i == 0)
            {
                repaired.append(token);
                continue;
            }
            int end = token.indexOf(">");
            if (end < 0)
            {
                end = token.indexOf(" ");
            }
            if (end < 0)
            {
                end = token.length();
            }
            repaired.append("=\"").append(token.substring(0, end)).append("\"").append(token.substring(end));
        }
        return repaired.toString();
    }

    /**
     * Writes a copy of a file in which every line went through {@link #quoteAttributes(String)}.
     * @param source Original file.
     * @param destination File to create or overwrite.
     * @throws Exception
     */
    private static void repairSyntax(File source, File destination) throws Exception
    {
        BufferedReader in = new BufferedReader(new FileReader(source));
        try
        {
            BufferedWriter out = new BufferedWriter(new FileWriter(destination));
            try
            {
                String line;
                while ((line = in.readLine()) != null)
                {
                    out.write(quoteAttributes(line)+"\n");
                }
            }
            finally
            {
                out.close();
            }
        }
        finally
        {
            in.close();
        }
    }

    /**
     * Checks the lemma and the sense number of one word, asking the user or auto-fixing when needed.
     * Only words with cmd="done" (and cmd="tag" when tag is true) are checked.
     * @param dic Base dictionary.
     * @param words Words in the document.
     * @param w Index of the word to check.
     * @param tag True for also checking words with cmd="tag".
     * @throws Exception
     */
    private static void repairWord(DataBroker dic, ArrayList<Element> words, int w, boolean tag) throws Exception
    {
        Element word = words.get(w);
        if (word.getAttribute("ot") != null)
        {
            word.setAttribute("cmd", "ignore");
        }
        String cmd = word.getAttributeValue("cmd");
        boolean done = "done".equals(cmd);
        if (!done && !(tag && "tag".equals(cmd)))
        {
            return;
        }
        String lemma = word.getValue();
        String pos = posInitial(word.getAttributeValue("pos"));
        Lemma l = null;
        if (done)
        {
            String lemmaAttribute = word.getAttributeValue("lemma");
            if (lemmaAttribute != null)
            {
                lemma = lemmaAttribute;
                l = dic.getLemma(lemma+"_"+pos);
            }
            else
            {
                lemma = "";
                pos = UNKNOWN_POS;
            }
        }
        if (l == null)//lemma may not be in normal form
        {
            SemCorCleaner.displayWindow(words, w);
            SemCorCleaner.readLemma(dic, lemma, word, pos);
            String fixedLemma = word.getAttributeValue("lemma");
            if (fixedLemma != null)
            {
                // Skipped when the attribute is absent: concatenating null would look up the
                // literal entry "null_P", which may well exist in the dictionary.
                l = dic.getLemma(fixedLemma+"_"+posInitial(word.getAttributeValue("pos")));
            }
        }
        if (l != null && !SemCorCleaner.isValidWNSN(word.getAttributeValue("wnsn"), l.getSenses().size()))
        {
            SemCorCleaner.displayWindow(words, w);
            SemCorCleaner.checkSenses(word, l);
        }
    }

    /**
     * Parses a .sgf file, checks every wf node and writes the corrected document over the same file.
     * A file that cannot be parsed is reported and scheduled for deletion when the JVM exits.
     * @param dic Base dictionary.
     * @param file Target .sgf file.
     * @param tag True for also checking words with cmd="tag".
     * @throws Exception
     */
    private static void repairLemmas(DataBroker dic, File file, boolean tag) throws Exception
    {
        try
        {
            SAXBuilder builder = new SAXBuilder();
            Document xml = builder.build(file);
            // Copied into a list first because words are accessed by index for the context window.
            ArrayList<Element> words = new ArrayList<Element>();
            for (Element word : xml.getDescendants(new ElementFilter("wf")))
            {
                words.add(word);
            }
            for (int w = 0; w < words.size(); w++)
            {
                repairWord(dic, words, w, tag);
            }
            XMLOutputter xmlOutput = new XMLOutputter();
            //display nice
            xmlOutput.setFormat(Format.getPrettyFormat());
            FileWriter writer = new FileWriter(file);
            try
            {
                xmlOutput.output(xml, writer);
            }
            finally
            {
                // The writer used to be left open, leaking one file handle per processed file.
                writer.close();
            }
            System.out.println("------------Correction finished for "+file.getName()+"-------------------------");
        }
        catch (JDOMException e)
        {
            System.err.println("Could not parse "+file.getName()+" ("+e.getMessage()+"); the file will be deleted on exit");
            file.deleteOnExit();
        }
    }

    /**
     * Entry point. First creates a quoted ".sgf" copy of every target file, then checks the
     * lemmas and sense numbers of the generated files.
     * @param args dictionaryClass Version target [tag] [rewrite] [fix]
     */
    public static void main(String[] args) throws Exception{
        if (args.length < 3)
        {
            System.out.println("Usage: java -cp\"gannu.jar\" gannuWSD.SemCorCleaner dictionaryClass Version target [tag] [rewrite] [fix] for repairing/cleaning the semcor files!");
            return;
        }
        System.out.println("Repairing file(s) syntax please wait!");
        boolean rewrite = false;
        boolean tag = false;
        SemCorCleaner.fix = false;
        for (int i = 3; i < args.length; i++)
        {
            if ("tag".equals(args[i]))
            {
                tag = true;
            }
            else if ("rewrite".equals(args[i]))
            {
                rewrite = true;
            }
            else if ("fix".equals(args[i]))
            {
                SemCorCleaner.fix = true;
            }
        }
        ArrayList<File> files = Util.getAllFiles(new File(args[2]));
        DataBroker dic = new DataBroker(args[0],args[1]);
        dic.setPath(".");
        dic.load("Glosses");
        int x = 1;
        for (File file : files)
        {
            File fout = new File(file.getCanonicalFile()+".sgf");
            // Generated files are never repaired again; existing outputs are kept unless rewrite is set.
            if (!file.getName().endsWith(".sgf") && (!fout.exists() || rewrite))
            {
                System.out.println("Repairing file: "+file.getName()+ " "+String.valueOf(x)+"/"+String.valueOf(files.size()));
                x++;
                repairSyntax(file, fout);
            }
        }
        System.out.println("Repairing bad lemmas!");
        File target = new File(args[2]);
        if (!target.isDirectory())
        {
            target = new File(args[2]+".sgf");
        }
        files = Util.getAllSGFFiles(target);
        x = 1;
        for (File file : files)
        {
            System.out.println("-----------Checking file: "+file.getName()+ " "+String.valueOf(x)+"/"+String.valueOf(files.size())+"--------------");
            x++;
            repairLemmas(dic, file, tag);
        }
    }
}