Package gannuWSD

Source Code of gannuWSD.SemCorCleaner

package gannuWSD;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.util.ArrayList;


import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.filter.ElementFilter;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;

import gannuNLP.data.Lemma;
import gannuNLP.data.Sense;
import gannuNLP.dictionaries.DataBroker;
import gannuUtil.Util;

/**
* Command for cleaning SemCor files written in previous XML formats.
* Usage:<br/>
* java -cp "gannu.jar" gannuWSD.SemCorCleaner dictionaryClass Version target [tag] [rewrite] [fix]<br/>
* You can add the following flags:<br/>
* <ul>
* <li>tag: allows you to continue marking on the gold standard.
* Words with cmd="tag" in the original file are open-class words which were not disambiguated manually. </li>
* <li>rewrite: rewrite previously .sgf files generated.</li>
* <li>fix: turns autofix feature on. If not you have to correct any warning by yourself. Some sample
*  warnings are: bad correct sense number, lemma unavailable in dictionary, etc.</li>
* </ul>
* @author Francisco Viveros-Jim&eacute;nez
*
*/
public class SemCorCleaner {
  /**
   * Flag for the autofix feature.
   */
  public static boolean fix;
  /**
   * Method that prompts the user for completing missing data of a word.
   * @param word Target word in its XML node format.
   * @param l Corresponding lemma.
   * @throws Exception
   */
  public static void checkSenses(Element word,Lemma l)throws Exception
  {
    BufferedReader kb = new BufferedReader(new InputStreamReader(System.in));
    String wnsn=word.getAttributeValue("wnsn");
    while(!SemCorCleaner.isValidWNSN(wnsn, l.getSenses().size()))
    {
      System.out.println("\nBad wnsn attribute for <"+word.getValue()+"> please select one");
      int j=1;
      if(l.getSenses().size()>1)
      {
        if(!SemCorCleaner.fix)
        {
          for(Sense s:l.getSenses())
          {
            System.out.println(String.valueOf(j)+":"+s.getSamples().toString());
            j++;
          }
          System.out.println("Please write the valid sense number(s) separated with ; or pick zero for choosing other sense");
          wnsn=kb.readLine();
        }
        else
        {
          System.out.println("Marking "+word.getValue()+" for future tagging ");
          word.setAttribute("cmd","tag");
          break;
        }
       
      }
      else
      {
        wnsn="1";
        System.out.println("Auto-fix of single sense lemma");
      }
    }
    word.setAttribute("wnsn",wnsn);
 
  }
  /**
   * Displays words in a 7 words radius around the current word.
   * @param words Words in the document.
   * @param w Current word.
   */
  public static void displayWindow(ArrayList<Element> words,int w)
  {
    //Print a window
    int window=14;
    int a;
    for(a=(int)-Math.floor(window/2);a<0;a++)
    {
      if((a+w)>=0)
      {
        System.out.print(words.get(a+w).getValue()+" ");
        window--;
      }
    }
    System.out.print("<"+words.get(w).getValue()+"> ");
    a=1;
    while((window>0)&&(a+w<words.size()))
    {
      System.out.print(words.get(a+w).getValue()+" ");
      window--;
      a++;
    }
  }
  /**
   * Method for checking if a lemma exists in the dictionary.
   * Prompts the user when the lemma is unavailable or use the auto-fix if specified.
   * @param dic Base dictionary.
   * @param lemma Target lemma.
   * @param word Original wf XML node.
   * @param pos POS tag.
   * @throws Exception
   */
  public static void readLemma(DataBroker dic,String lemma,Element word,String pos)throws Exception
  {
    String lem=lemma+"_"+pos;
    Lemma l=null;
    ArrayList<Sense> senses=dic.getSenses(lem);
    BufferedReader kb = new BufferedReader(new InputStreamReader(System.in));
    if(senses.size()==0)//
    {
      if(!SemCorCleaner.fix)
      {
        System.out.println("Lemma not found: "+lem);
        System.out.println("Please enter a lemma or no for ignoring this word!");
        String plemma=kb.readLine();
        if(plemma.equals("no"))
        {
          word.setAttribute("cmd","ignore");
        }
        else
        {
          l=dic.getLemma(plemma);
          while(l==null)
          {
            System.out.println("Please write the valid form (lemma_P where lemma is in normal form and P is the first letter of the pos tag)");
            plemma=kb.readLine();
            l=dic.getLemma(plemma);
          }       
          word.setAttribute("lemma",l.getLemma().substring(0,l.getLemma().length()-2));
          word.setAttribute("pos",l.getPos());
          word.setAttribute("wnsn","-1");
        }
      }
      else
      {
        System.out.println("Marking "+word.getValue()+" for future tagging ");
        word.setAttribute("cmd","tag");
      }
    }
    else
    {
      System.out.println("\n<"+word.getValue()+"> may not be in normal form");
      ArrayList<String> morphs=dic.Morphy(lemma, pos);
      do
      {     
        String plemma;
        if(morphs.size()==1)
        {
          plemma=morphs.get(0)+"_"+pos;
          System.out.println(lemma+" was autocorrected to "+ plemma);
        }
        else
        {
          if(!SemCorCleaner.fix)
          {
            System.out.println("Please select the valid form (lemma_P where lemma is in normal form and P is the first letter of the pos tag).\nSome suggestions are "+morphs+" just remember to add the _"+pos+" pos tagat the end of the lemma");
            plemma=kb.readLine();
          }
          else
          {
            System.out.println("Marking "+word.getValue()+" for future tagging ");
            word.setAttribute("cmd","tag");
            break;
          }
        }
        l=dic.getLemma(plemma);
      }
      while(l==null&&!SemCorCleaner.fix);
      if(l!=null)
      {
        word.setAttribute("lemma",l.getLemma().substring(0,l.getLemma().length()-2));
        word.setAttribute("pos",l.getPos());
        word.setAttribute("wnsn","-1");
      }
    }
  }
  /**
   * Tells is wnsn attribute is valid.
   * @param wnsn Target wnsn attribute.
   * @param senses Polysemy of the target word.
   * @return True when having a valid wnsn attribute.
   */
  public static boolean isValidWNSN(String wnsn,int senses)
  {
    if(wnsn.equals("U"))
      return true;
    for(String sense:wnsn.split(";"))
    {
      int s=Integer.parseInt(sense);
      if(s<=0||s>senses)
        return false;
    }
    return true;
  }
  /**
   * @param args
   */
  public static void main(String[] args) throws Exception{
    if(args.length<3)
    {
      System.out.println("Usage: java -cp\"gannu.jar\" gannuWSD.SemCorCleaner dictionaryClass Version target [tag] [rewrite] [fix] for repairing/cleaning the semcor files!");
    }
    else
    {
      System.out.println("Repairing file(s) syntax please wait!");
      boolean rewrite=false;
      boolean tag=false;
      SemCorCleaner.fix=false;
      for(int i=3;i<args.length;i++)
      {
        if(args[i].equals("tag"))
          tag=true;
        if(args[i].equals("rewrite"))
          rewrite=true;
        if(args[i].equals("fix"))
          SemCorCleaner.fix=true;
      }
     
      ArrayList<File> files=Util.getAllFiles(new File(args[2]));
      DataBroker dic=new DataBroker(args[0],args[1]);
      dic.setPath(".");
      dic.load("Glosses");
      int x=1;
      for(File file:files)
      {
        File fout=new File(file.getCanonicalFile()+".sgf");
        if(!file.getName().endsWith(".sgf")&&(!fout.exists()||rewrite))
        {
          System.out.println("Repairing file: "+file.getName()+ " "+String.valueOf(x)+"/"+String.valueOf(files.size()));
          x++;
          FileReader f=new FileReader(file);
          BufferedReader in=new BufferedReader(f);
          FileWriter fo=new FileWriter(file.getCanonicalFile()+".sgf");
          BufferedWriter out=new BufferedWriter(fo);
          String line=in.readLine();
          while(line!=null)
          {
            String sout="";
            line=line.replace("&", "&amp;");
            String tokens[]=line.split("=");
            for(int i=0;i<tokens.length;i++)
            {
              if(i==0)
              {
                sout+=tokens[i];
             
              else
              {
                int j=tokens[i].indexOf(">");
                if(j<0)
                  j=tokens[i].indexOf(" ");
                sout+="=\""+tokens[i].substring(0, j)+"\""+tokens[i].substring(j);
              }
            }
            out.write(sout+"\n");
            line=in.readLine();         
          }
          in.close();
          f.close();
          out.close();
          fo.close();
        }
      }
      System.out.println("Repairing bad lemmas!");
      File target=new File(args[2]);
      if(!target.isDirectory())
        target=new File(args[2]+".sgf");
      files=Util.getAllSGFFiles(target);
      x=1;
      for(File file:files)
      {
        System.out.println("-----------Checking file: "+file.getName()+ " "+String.valueOf(x)+"/"+String.valueOf(files.size())+"--------------");
        x++;
        try
        {
          SAXBuilder builder=new SAXBuilder();
          Document xml=(Document)builder.build(file);
          ArrayList<Element> words=new ArrayList<Element>();
          for(Element word:xml.getDescendants(new ElementFilter("wf")))
            words.add(word);
          for(int w=0;w<words.size();w++)
          {
            Element word=words.get(w);
            if(word.getAttribute("ot")!=null)
            {
              word.setAttribute("cmd", "ignore");
            }
            if((word.getAttribute("cmd").getValue().equals("done"))||(tag&&(word.getAttribute("cmd").getValue().equals("tag"))))
            {
              String lemma=word.getValue();
              String pos=word.getAttribute("pos").getValue().substring(0,1);
              String lem;
              Lemma l=null;
              if(word.getAttribute("cmd").getValue().equals("done"))
                { 
                if(word.getAttribute("lemma")!=null)
                {
                  lemma=word.getAttribute("lemma").getValue();
                    pos=word.getAttribute("pos").getValue().substring(0,1);
                    lem=lemma+"_"+pos;
                    l=dic.getLemma(lem);                 
                }
                else
                {
                  lemma="";
                  pos="X";
                }
                }
              lem=lemma+"_"+pos;
 
             
              if(l==null)//lemma may not be in normal form
              {
               
                SemCorCleaner.displayWindow(words, w);
                SemCorCleaner.readLemma(dic, lemma, word, pos);
                l=dic.getLemma(word.getAttributeValue("lemma")+"_"+word.getAttributeValue("pos").substring(0,1));
              }
              if(l!=null)
              {
                if(!SemCorCleaner.isValidWNSN(word.getAttributeValue("wnsn"), l.getSenses().size()))
                {
                  SemCorCleaner.displayWindow(words, w);
                  SemCorCleaner.checkSenses(word, l);
                }
              }
            }         
          }
          XMLOutputter xmlOutput = new XMLOutputter();
          //display nice
          xmlOutput.setFormat(Format.getPrettyFormat());
          xmlOutput.output(xml, new FileWriter(file));
          System.out.println("------------Correction finished for "+file.getName()+"-------------------------");
        }
        catch(JDOMException e)
        {
          file.deleteOnExit();
        }
      }     
    }
  }

}
TOP

Related Classes of gannuWSD.SemCorCleaner

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.