Package gannuNLP.dictionaries

Source Code of gannuNLP.dictionaries.WordNet

package gannuNLP.dictionaries;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.regex.Pattern;
import gannuNLP.data.Lemma;
import gannuNLP.data.Relation;
import gannuNLP.data.Sense;
import gannuUtil.KeyArray;
import gannuUtil.KeyString;


/**
* A simple connector to WordNet. Before using it you must set the path with WordNet.setPath(String);
* WordNet.loadDataBase(String) is the first method that should be executed.
* This connector is a little slow to load because of the counts needed for IDF calculation.
* The valid versions of WordNet are 3.0, 2.1 and 1.7. Other versions can be added with the dictLoader command.
* @author Francisco Viveros-Jiménez
*
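* <p>A minimal usage sketch (hedged: setPath(String) is the inherited setter mentioned above, and
* "Glosses;Samples" names the sample sources written by parseSamplesFromDictionary()):</p>
* <pre>
* WordNet wn = new WordNet();
* wn.setVersion("3.0");                   // this.name becomes "WordNet 3.0"
* wn.setPath("Resources/WordNet 3.0/");   // folder holding the data.* and index.* files
* wn.load("Glosses;Samples");             // loadCoreData() plus samples and counts for IDF
* Lemma lemma = wn.getLemma("dog");       // lookup key format depends on getSenses()
* </pre>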
*/
public class WordNet extends Dictionary{
  /**
   *
   */
  private static final long serialVersionUID = 1L;

  public WordNet(){
    super();
    this.usesPOSTag=true;
    this.isWeb=false;
    this.path="Resources/WordNet 3.0/";
    this.name="WordNet ";
    this.language="en";
  }
 
  @Override
  public void loadCoreData() throws Exception {
  //Memory initialization
   
    ArrayList<FileReader> data=new ArrayList<FileReader>(4);   
    ArrayList<FileReader> index=new ArrayList<FileReader>(4);
    senseMaps=new ArrayList<KeyArray>(155290);
    senses=new ArrayList<Sense>(117660);
    this.loadIrregulars();
    glossCount=0.0;
   
    data.add(new FileReader(path+"/data.noun"));
    data.add(new FileReader(path+"/data.verb"));
    data.add(new FileReader(path+"/data.adj"));
    data.add(new FileReader(path+"/data.adv"));
   
    index.add(new FileReader(path+"/index.noun"));
    index.add(new FileReader(path+"/index.verb"));
    index.add(new FileReader(path+"/index.adj"));
    index.add(new FileReader(path+"/index.adv"));
 
    //load prepositions
    String line;
   
    prepositions=new ArrayList<String>(70);
    //System.out.println("Loading prepositions");
    FileReader f=new FileReader("data/prepositions");
    BufferedReader in=new BufferedReader(f);
    line=in.readLine();
    while(!(line==null))
    {
      prepositions.add(line);
      line=in.readLine();
    }
    in.close();
    f.close();
    Collections.sort(prepositions);
   
    int i=0;
       
    String tokens[];
    //Loading sense mapping
   
    //System.out.println("Loading sense maps");
    maxCollocationSize=0;   
    for(FileReader inf:index)
    {
      String pos="_"+getPOS(i);
      in=new BufferedReader(inf);
      line=in.readLine();
      while(!(line==null))
      {
        if(!line.startsWith(" "))
        {
          tokens=line.split(" ");
          int x=tokens[0].split("_").length;
          if(x>maxCollocationSize)
            maxCollocationSize=x;
          senseMaps.add(new KeyArray(tokens[0]+pos, parseSenseMap(tokens,pos)));
        }
        line=in.readLine();
      }
      in.close();
      inf.close();
      i++;
    }
    Collections.sort(senseMaps);
    //Loading glosses
    i=0;
   
    System.out.println("Loading dictionary (wait some minutes please!)");
    for(FileReader inf:data)
    {     
      in=new BufferedReader(inf);
      line=in.readLine();
      while(!(line==null))
      {
        if(!line.startsWith("  "))
        {
            senses.add(parseGloss(line,i));
        }
        line=in.readLine();
      }
     
      in.close();
      inf.close();
      i++;
    }
    glossCount+=(double)senses.size();   
    Collections.sort(senses);
    data.clear();
    index.clear();
  }
  @Override
  public void load(String sampleSources) throws Exception {
    loadCoreData();
    //Load samples & counts
    wordCounts=new ArrayList<KeyString>();
    String sources[]=sampleSources.split(";");
    this.wordCount=0.0;
    for(String source:sources)
    {
      try
      {
        //System.out.println("Loading "+source+" samples");
        FileReader in=new FileReader("Resources/"+this.name+"/samples/"+source);
        loadSamplesFromSource(in);   
        in.close();
        //System.out.println("Loading "+source+" counts");
        in=new FileReader("Resources/"+this.name+"/counts/"+source);
        loadCountsFromFile(in);
      }
      catch(Exception e)
      {
        System.out.println("Error: Must parse "+source+" first!");
      }
     
    }
    System.out.println("Setting word counts for IDF calculation");
    wordCounts.trimToSize();
    Collections.sort(wordCounts);
  }
  public void changeSenseId() throws Exception
  {
    //Change all the synsetIds to senseIds
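    //A senseId has the form "<lemma>@<index of the sense in that lemma's sense list>",
    //e.g. something like "dog_NN@0" (illustrative; the exact key suffix comes from getPOS()).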
    //First the relations
    for(Sense sense:this.senses)
    {
      for(ArrayList<Relation> rels:sense.getRelations().values())
      {
        for(Relation rel:rels)
        {
          Sense s=this.getSense(rel.getSid()+"_"+rel.getPos());
          ArrayList<Sense> ss=new ArrayList<Sense>();
          int j=-1;
          while(ss.size()==0)
          {
             j++;
               ss=this.getSenses(s.getSynonyms().get(j));
          }
          int i;
          for(i=0;i<ss.size();i++)
          {
            if(ss.get(i).equals(s))
              break;
          }
          rel.setSid(s.getSynonyms().get(j)+"@"+String.valueOf(i));
        }
      }
    }
    ArrayList<String> c=new ArrayList<String>();
    ArrayList<Sense> newlist=new ArrayList<Sense>(this.senses.size());
    ArrayList<KeyString> equivalents=new ArrayList<KeyString>(this.senses.size());
    c.add("all");
    for(Sense sense:this.senses)
    {
      ArrayList<Sense> ss=new ArrayList<Sense>();
      int j=-1;
      while(ss.size()==0)
      {
        j++;
        ss=this.getSenses(sense.getSynonyms().get(j));
      }
     
      int i;
      for(i=0;i<ss.size();i++)
      {
        if(ss.get(i).equals(sense))
          break;
      }
     
      Sense s=new Sense(sense,c);
      s.setSid(sense.getSynonyms().get(j)+"@"+String.valueOf(i));
      newlist.add(s);
      equivalents.add(new KeyString(sense.getSid(),s.getSid()));
    }
    Collections.sort(newlist);
    Collections.sort(equivalents);
    this.senses=null;
    this.senses=newlist;
    for(KeyArray k:this.senseMaps)
    {
      for(int j=0;j<k.getArray().size();j++)
      {
        int index=Collections.binarySearch(equivalents, new KeyString(k.getArray().get(j)));
        k.getArray().set(j, equivalents.get(index).getString());
      }
    }
  }
  @Override
  public void parseSamplesFromDictionary() throws Exception {
    ArrayList<KeyString> gcounts=new ArrayList<KeyString>();
    ArrayList<KeyString> scounts=new ArrayList<KeyString>();
    ArrayList<FileReader> data=new ArrayList<FileReader>(4);
    data.add(new FileReader(path+"/data.noun"));
    data.add(new FileReader(path+"/data.verb"));
    data.add(new FileReader(path+"/data.adj"));
    data.add(new FileReader(path+"/data.adv"));
    System.out.println("Loading glosses and creating parsed files");
    File f=new File("./Resources/"+this.name+"/samples/");
    f.mkdirs();
    f=new File("./Resources/"+this.name+"/counts/");
    f.mkdirs();
    f=null;
    FileWriter writeg=new FileWriter("Resources/"+this.name+"/samples/Glosses");
    FileWriter writes=new FileWriter("Resources/"+this.name+"/samples/Samples");
    BufferedWriter outg=new BufferedWriter(writeg);
    BufferedWriter outs=new BufferedWriter(writes);
    FileWriter writecg=new FileWriter("Resources/"+this.name+"/counts/Glosses");
    FileWriter writecs=new FileWriter("Resources/"+this.name+"/counts/Samples");
    BufferedWriter outcg=new BufferedWriter(writecg);
    BufferedWriter outcs=new BufferedWriter(writecs);
   
    int i=0;
    int z=0;
    for(FileReader inf:data)
    {
     
      BufferedReader in=new BufferedReader(inf);
      String line=in.readLine();
      String pos=getPOS(i);
      String lineout;
      while(line!=null)
      {
        if(!line.startsWith("  "))
        {
          z++;
          if(z%1000==0)
            System.out.println("Parsing WordNet Synset "+z);
          String [] tokens=line.split("\\|");
          String sid=tokens[0].split(" ")[0]+"_"+pos;
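          //The text after '|' is the gloss followed by quoted example sentences; splitting on '"'
          //leaves the gloss in samples[0], while the short fragments between quotes are discarded
          //later by the word-count check on samples[k].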
          String []samples=tokens[1].split("\"");
          String gloss=samples[0];
          lineout=sid+"|"+gloss+"|";
          ArrayList<ArrayList<String>> parsedGloss=lemmatize(gloss);
          if(parsedGloss.size()==0)
             parsedGloss=this.POSTagging(gloss);
          for(ArrayList<String> lemmas:parsedGloss)
            for(String lemma:lemmas)
            {
              //String lemma=lemmas.get(0);
              lineout+=lemma+" ";
              int j=gcounts.indexOf(new KeyString(lemma));
              if(j<0)
              {
                gcounts.add(new KeyString(lemma,"1"));
              }
              else
              {
                KeyString ks=gcounts.get(j);
                ks.setString(String.valueOf(Integer.parseInt(ks.getString())+1));
              }
            }
         
          lineout=lineout.substring(0,lineout.length()-1)+"\n";
          outg.write(lineout);
          for(int k=1;k<samples.length;k++)
          {
            if(samples[k].split(" ").length>2)
            {
              String sample=samples[k];
              lineout=sid+"|"+sample+"|";
              ArrayList<ArrayList<String>> parsedSample=lemmatize(sample);
              for(ArrayList<String> lemmas:parsedSample)
                for(String lemma:lemmas)
                {
                  lineout+=lemma+" ";
                  int j=scounts.indexOf(new KeyString(lemma));
                  if(j<0)
                  {
                    scounts.add(new KeyString(lemma,"1"));
                  }
                  else
                  {
                    KeyString ks=scounts.get(j);
                    ks.setString(String.valueOf(Integer.parseInt(ks.getString())+1));
                  }
                }
              lineout=lineout.substring(0,lineout.length()-1)+"\n";
              outs.write(lineout);
            }
          }
          //Parsing the gloss or the samples
        }
        line=in.readLine();
      }
      in.close();
      inf.close();
      i++;
    }
    for(KeyString ks:gcounts)
    {
      outcg.write(ks.getKey()+"|"+ks.getString()+"\n");
    }
    for(KeyString ks:scounts)
    {
      outcs.write(ks.getKey()+"|"+ks.getString()+"\n");
    }
    outcg.close();
    outcs.close();
    writecs.close();
    writecg.close();
    outg.close();
    outs.close();
    writes.close();
    writeg.close();
  }

/**
* Method for extracting the possible synsets of a lemma.
* @param tokens Array containing the values of a line.split(" ") operation on a line
* from a WordNet index.POS file.
* @param pos The POS tag of the current file.
* @return An ArrayList with the corresponding synsets of a lemma.
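* <p>Hypothetical index line (offsets are placeholders) in the layout this method walks:
* lemma, pos, synset_cnt, p_cnt, p_cnt pointer symbols, sense_cnt, tagsense_cnt, synset offsets:</p>
* <pre>
* dog n 2 1 @ 2 2 02084071 10114209
* </pre>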
*/
  private static ArrayList<String> parseSenseMap(String[] tokens, String pos) {
    int i=4+Integer.parseInt(tokens[3]);
    int limit=Integer.parseInt(tokens[i]);
    i+=2;
    ArrayList<String>aux=new ArrayList<String>(limit);
    for(int z=0;z<limit;z++)
    {
      aux.add(tokens[i]+pos);
      i++;
    }
    return aux;
  }
  /**
   * Method for extracting a synset from a line of a WordNet data.POS file.
   * @param line The line to be processed.
   * @param pos The POS tag of the WordNet file.
   * @return A synset object.
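   * <p>Hypothetical data line (offsets are placeholders) in the layout this method consumes:
   * synset_offset, lex_filenum, ss_type, w_cnt (hex), w_cnt (word, lex_id) pairs, p_cnt,
   * p_cnt (pointer_symbol, offset, pos, source/target) quadruples, and the gloss after '|':</p>
   * <pre>
   * 02084071 05 n 01 dog 0 001 @ 02083346 n 0000 | a domesticated canid; "the dog barked all night"
   * </pre>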
   */
  private Sense parseGloss(String line, int pos) {
    String gloss[]=line.split("\\|");
    String tokens[]= gloss[0].split(" ");
    String postag="_"+getPOS(pos);
    int i=4;
    int limit=Integer.valueOf(tokens[3],16);
    ArrayList<String> synonyms=new ArrayList<String>(limit);
    for(int z=0;z<limit;z++)
    {
      synonyms.add(tokens[i].replaceAll(Pattern.quote("(")+"[a-zA-Z]+"+Pattern.quote(")"), "")+postag);
      i+=2;
    }
   
   
    Sense ps=new Sense(tokens[0],getPOS(pos),synonyms);
    //Adding relations
    limit=Integer.parseInt(tokens[i]);
    i++;
    for(int z=0;z<limit;z++)
    {
      String WNType=tokens[i];
      String type="";
      if(WNType.equals("!"))
        type="Antonym";
      if(WNType.equals("@"))
        type="Hypernym";
      if(WNType.equals("@i"))
        type="Instance Hypernym";
      if(WNType.equals("~"))
        type="Hyponym";
      if(WNType.equals("~i"))
        type="Instance Hyponym";
      if(WNType.equals("#m"))
        type="Member Holonym";
      if(WNType.equals("#s"))
        type="Substance Holonym";
      if(WNType.equals("#p"))
        type="Part Holonym";
      if(WNType.equals("%m"))
        type="Member Meronym";
      if(WNType.equals("%s"))
        type="Substance meronym";
      if(WNType.equals("%p"))
          type="Part meronym";
      if(WNType.equals("="))
        type="Attribute";
      if(WNType.equals("+"))
        type="Derivationally related form";
      if(WNType.equals(";c"))
        type="Domain of synset";
      if(WNType.equals("-c"))
        type="Member of this domain";
      if(WNType.equals(";r"))
        type="Region of synset";
      if(WNType.equals("-r"))
        type="Member of this Region";
      if(WNType.equals(";u"))
        type="Usage of synset";
      if(WNType.equals("-u"))
        type="Member of this Usage";
      if(WNType.equals("*"))
        type="Entailment";
      if(WNType.equals(">"))
        type="Cause";
      if(WNType.equals("^"))
        type="Also see";
      if(WNType.equals("$"))
        type="Verb group";
      if(WNType.equals("&"))
        type="Similar to";
      if(WNType.equals("<"))
        type="Participle of verb";
      if(WNType.equals("\\"))
        type="Pertainym";

       ps.addRelation(tokens[i], new Relation(type,tokens[i+1],tokens[i+2]));
      i+=4;
    }
    return ps;
  }

  /**
   * Write all the loaded data into SuperLemma files.
   * SuperLemma files act as cache files for all the dictionaries.
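   * <p>Sketch (hypothetical target folder; the dictionary must be loaded first so that
   * changeSenseId() has senses to rewrite):</p>
   * <pre>
   * wn.load("Glosses;Samples");
   * wn.WriteSuperLemmas("Resources/WordNet 3.0/");
   * </pre>
   * @param path The target folder handed to the superclass implementation.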
   */
  public void WriteSuperLemmas(String path)throws Exception
  {
    this.changeSenseId();
    super.WriteSuperLemmas(path);
  }

  @Override
  public Lemma getLemma(String lemma) throws Exception {
        ArrayList<Sense> s=this.getSenses(lemma);
        Lemma l=null;
        if(s.size()>0)
           l=new Lemma(lemma,s.get(0).getPos(),s,this.getCounts(lemma),this.name);
    return l;
  }

  @Override
  public void setVersion(String version) {
    this.name+=version;   
  }

  @Override
  public boolean doesLemmaExists(String lemma) throws Exception{   
    return Collections.binarySearch(senseMaps,lemma)>=0;
  }
 
}