// Package gannuNLP.dictionaries
// Source code of gannuNLP.dictionaries.DataBroker

package gannuNLP.dictionaries;

import gannuNLP.data.Count;
import gannuNLP.data.Lemma;
import gannuNLP.data.Sense;
import gannuNLP.data.SuperLemma;
import gannuUtil.Util;
import gannuWSD.DataLoader;
import gannuWSD.bowmodifiers.BoWModifier;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;



/**
* DataBroker exists for accessing all the available dictionaries through a single object.
* DataBroker has the following features: <br/>
* (1) It maintains a cache of SuperLemma objects for avoiding loading entire off-line dictionaries
* and accessing the online ones.<br/>
* (2) It maintains a cache of lemma objects for avoiding recalculation of the BoWs
*  made by BoWModifier objects.<br/>
* (3) It allows you to load samples from previously processed sample sources into the BoW.
* SuperLemma files allows a faster access to the dictionary/corpus data for each lemma.<br/>
* (4) It saves a good amount of memory usage at the cost of some hard drive storage.
* Therefore, DataBroker is the main way for accessing data from dictionaries and sample sources.
*  
* @author Francisco Viveros-Jim&eacute;nez
*
*/
public class DataBroker extends Dictionary{
  /**
   * Method for setting the base URL of web dictionaries.
   */
  public void setBaseURL(String URL)
  {
    if(this.isWeb)
    {
      this.source.setPath(URL);
    }
  }
  /** Serialization version identifier for this class. */
  private static final long serialVersionUID = 1L;
  /**
   * Active BoWModifiers being used in this session; applied to every
   * lemma returned by getLemma(String).
   */
  ArrayList<BoWModifier> modifiers;
 
  /**
   * Base dictionary that this broker wraps and caches.
   */
  Dictionary source;
  /**
   * List of lemmas whose IDF counts are already loaded in the cache
   * (parallel to this.counts).
   */
  ArrayList<String> lemmas;
  /**
   * Cache of counts for calculating IDF (parallel to this.lemmas).
   */
  ArrayList<Double> counts;
  /**
   * List of loaded sample sources, set by load(String).
   */
  ArrayList<String> sampleSources;
  /**
   * Tells whether the source dictionary core data has been loaded
   * (see loadSource()).
   */
  boolean loaded;
 
  /**
   *
   * @return The base dictionary.
   */
  public Dictionary getSource() {
    return source;
  }

  /**
   * Creates a DataBroker for loading senses from an specified source dictionary.
   * @param Source Full class name of the source dictionary.
   */
  public DataBroker(String Source)throws Exception{
    super();
    this.modifiers=new ArrayList<BoWModifier>();
    this.loaded=false;
    this.source=(Dictionary)Class.forName(Source).newInstance();
    this.isWeb=this.source.isWeb();
    this.path=".";
    this.name=this.source.getName();
    this.glossCount=0;
    this.lemmas=new ArrayList<String>();
    this.counts=new ArrayList<Double>();
    this.usesPOSTag=this.source.usesPOSTag;
  }
 
 
  /**
   * Creates a DataBroker for a specific version of the source dictionary.
   * @param Source Full class name of the source dictionary.
   * @param version Version string forwarded to the source dictionary before
   * any data is queried from it.
   * @throws Exception When the source dictionary class cannot be found or instantiated.
   */
  public DataBroker(String Source, String version) throws Exception{
    this.modifiers=new ArrayList<BoWModifier>();
    this.loaded=false;
    this.source=(Dictionary)Class.forName(Source).newInstance();
    // The version must be set before asking the source for the language it implies.
    this.source.setVersion(version);
    this.language=this.source.getLanguage();
    this.isWeb=this.source.isWeb();
    this.path=".";
    this.name=this.source.getName();
    this.glossCount=0;
    this.lemmas=new ArrayList<String>();
    this.counts=new ArrayList<Double>();
    this.usesPOSTag=this.source.usesPOSTag;
  }

  @Override
  /**
   * Loads the glossCount and wordCount data from the .sta files.
   * You should create the SuperLemmas first by executing targetDictionaryClass.WriteSuperLemmas() method.
   */
  public void loadCoreData() throws Exception {
  //Loading Statistics from dictionary
    try
    {
      FileReader f=new FileReader("./data/"+getName()+"/"+getName()+".sta");
      BufferedReader in=new BufferedReader(f);
      this.glossCount=Double.parseDouble(in.readLine());
      this.wordCount=Double.parseDouble(in.readLine());
      in.close();
      f.close();
   
    }
    catch(Exception e)
    {
      //  Dictionary not parsed, initialize it.
      if(!this.source.isWeb())
        throw new Exception("Error: Dictionary not loaded, use the java -jar dictLoader.jar connectorClass dictFilesPath GANNUpath command for parsing the dictionary first!");
    }
    if(this.source.isWeb())
    {     
      this.source.setVersion(this.language);
      this.source.setPath(this.path);
      this.source.loadCoreData();
      File aux=new File("./data/"+getName()+"/"+getName()+".sta");
      if(!aux.exists())
      {
        File f=new File("./data/"+getName()+"/");
        f.mkdirs();
        FileWriter fout=new FileWriter("./data/"+getName()+"/"+getName()+".sta");
        BufferedWriter out=new BufferedWriter(fout);     
        this.glossCount=this.source.getGlossCount();
        this.wordCount=this.source.getWordCount();
        out.write(String.valueOf(this.glossCount)+"\n");
        out.write(String.valueOf(this.wordCount)+"\n");
        out.close();
        fout.close();
      }
      aux=new File("./data/"+getName()+".sl");
      if(!aux.exists())
      {
        DataLoader.addSourceList(aux, getName());
      }
    }
  }
  @Override
  /**
   * Calls this.loadCoreData() and sets the valid sample sources.
   * @param sampleSources Valid sample sources.
   * Use Glosses for loading dictionary definitions and samples.
   * Use the corpus name for loading corpus samples.
   * The String should be written in the format "source1(,sourceN)*".
   */
  public void load(String sampleSources) throws Exception {
      loadCoreData();
      //  Set the corpus valid sources
      this.sampleSources=new ArrayList<String>(5);
      for(String source:sampleSources.split(","))
      {
        if(source.equals("Glosses"))
          this.sampleSources.add(this.name);
        else 
          this.sampleSources.add(source);
      }
  }
 
  @Override
  /**
   * Creates base files for loading a dictionary from zero by delegating to
   * the source dictionary (the source is lazily initialized first).
   * It is recommended to use the gannu.tools.dictLoader class instead.
   */
  public void parseSamplesFromDictionary() throws Exception {
    this.loadSource();
    this.source.parseSamplesFromDictionary();
  }
 
 
  /**
   * Loads a source dictionary if needed for doing operations other than queries
   * like lemmatization.
   * @throws Exception
   */
  private void loadSource() throws Exception
  {
    if(!this.loaded)
    {
      this.loaded=true;
      this.source.loadCoreData();
    }
  }
  /**
   * Retrieves the corresponding senses of a lemma extracted directly from the
   * base dictionary. This method is used for skipping the cache usage (not recommended).
   * @param lemma The target lemma.
   * @return An ArrayList containing the corresponding senses for the target Lemma.
   */
  public ArrayList<Sense> getSenses(String lemma) throws Exception
  {
    if(this.isWeb)
    {
      return this.getLemma(lemma).getSenses();
    }
    else
    {
      ArrayList<String> targetWords=new ArrayList<String>();
      ArrayList<String> posTags=new ArrayList<String>();
      if(this.usesPOSTag)
      {
        if(lemma.matches(".*_\\w"))
        {
          targetWords.add(lemma);
          posTags.add(lemma.substring(lemma.length()-2));
        }
        else
        {
          targetWords.add(lemma+"_N");
          targetWords.add(lemma+"_V");
          targetWords.add(lemma+"_J");
          targetWords.add(lemma+"_R");
          posTags.add("_N");
          posTags.add("_V");
          posTags.add("_J");
          posTags.add("_R");
        }
      }
      else
      {
        posTags.add("");
        targetWords.add(lemma);
      }
      ArrayList<String> lemmatas=new ArrayList<String>();
      ArrayList<Sense> senses=new ArrayList<Sense>();
      for(String plemma:targetWords)
      {       
        lemmatas.addAll(this.Morphy(plemma));       
      }
      for(String plemma:Util.removeDuplicates(lemmatas))
      {
        for(String posTag:posTags)
        {
          Lemma l=this.getLemma(plemma+posTag);
          if(l!=null)
          {
            for(Sense s:l.getSenses())
            {
              if(!senses.contains(s))
              {
                senses.add(s);
              }
            }
          }         
        }
       
      }
      return senses;
    }
   
  }
  /**
   * Adds a BoWModifier for this session; it will be applied to every lemma
   * subsequently returned by getLemma(String).
   * @param mod The BoWModifier being added.
   */
  public void addModifier(BoWModifier mod)
  {
    this.modifiers.add(mod);
  }
  /**
   * Return the IDF value of a lemma.
   * This method creates a cache by using the this.lemmas
   * and this.counts lists for speeding up the process.
   * @param lemma The target lemma.
   * @return The IDF value. The minimum frequency of a lemma is 1 for avoiding calculation errors.
   */
  public double getIDF(String lemma)throws Exception
  {
    File tmp=new File("./data/idfs/"+this.getCompleteName().replace(">", "@@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".idf");   
    int index=this.lemmas.indexOf(lemma);
   
    double count=1.0;
    if(index<0)
    {
      if(tmp.exists())
      {
        count=((Double)Util.loadObject(tmp)).doubleValue();
        return Math.log((this.glossCount)/count);
     
      else
      {
        File dir=new File("./data/idfs/"+this.getCompleteName().replace(">", "@@@@@@")+"/");
        if(!dir.exists())
        {
          dir.mkdirs();
        }
      }
      File ft=new File(this.path+"/data/"+this.getName()+"/"+Dictionary.normalizeLemmaforFile(lemma)+".slm");
      if(ft.exists())
      {
        Lemma l=this.getLemmaNoModifiers(lemma);
        if(l!=null)         
          for(Count c:l.getCounts())
            count+=c.getFrequency();       
      }
      this.lemmas.add(lemma);
      this.counts.add(new Double(count));
    }
    else
    {
      count=this.counts.get(index).doubleValue();
    }
    try
    {
      Util.writeObject(tmp, new Double(count));
    }
    catch(Exception e)
    {
      System.out.print("!");
    }
    return Math.log((this.glossCount)/count);   
  }
  /**
   * Returns a Lemma without using the BoWModifiers active for this session.
   * @param lemma The target lemma.
   * @return The corresponding Lemma object.
   * @throws Exception
   */
  public Lemma getLemmaNoModifiers(String lemma)throws Exception
 
    File d=new File("./data/lemmas/"+this.getName()+"/");
    d.mkdirs();
    SuperLemma s=this.loadSuperLemma(lemma,"./data/"+this.getName()+"/");
    Lemma l=null;
    if(s.getLemmas().size()>0)
    {
      l=s.retrieveLemma(this.source.toString());
      if(l!=null)
      {
        l=new Lemma(l,this.sampleSources);                   
      }

    }
    if(l==null&&this.source.isWeb())
    {
      l=this.source.getLemma(lemma);
      if(l!=null)
      {
        s.addLemma(l);
        this.WriteSuperLemma("./data/"+this.getName()+"/",s);
      }
    }
    return l;
  }
  /**
   * @return Returns the following String: "this.name+"_"+this.sampleSources.toString()+"_"+this.modifiers.toString()".
   */
  public String getCompleteName()
  {
    String text="_[";
    for(String t:this.sampleSources)
    {
      if(t.length()>4)
      {
        text+=t.substring(0,4)+";";
      }
      else
      {
        text+=t+";";
      }
    }
    text+="][";
    for(BoWModifier bow:this.modifiers)
    {
      text+=bow.getName().substring(0,4);
      if(bow.getParams().size()>0)
        text+="#"+bow.getSimplifiedParameterString().replace("/","_");
      text+=";";
    }
    text+="]";
    return this.name+text;
   
  }
  @Override
  /**
   * Returns a Lemma from the dictionary after processing it with the active BoWModifiers.
   * This method uses the cache often and is the one that handles the /data/lemmas/ folder.
   * @param lemma The target lemma.
   * @return Lemma The corresponding Lemma object.
   */
  public Lemma getLemma(String lemma)throws Exception
  {
    //Search in cache   
    File f=new File("./data/lemmas/"+this.getCompleteName().replace(">", "@@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".glm");
    if(f.exists())
    {     
      return (Lemma) Util.loadObject(f);
    }
    else
    {
      File d=new File("./data/lemmas/"+this.getCompleteName().replace(">", "@@@@@@")+"/");     
      d.mkdirs();
      d=new File("./data/lemmas/"+this.getName()+"/");
      d.mkdirs();
    }
    SuperLemma s=this.loadSuperLemma(lemma,"./data/"+this.getName()+"/");
    Lemma l=null;
    if(s.getLemmas().size()>0)
    {
      l=s.retrieveLemma(this.source.toString());
      if(l!=null)
      {
        l=new Lemma(l,this.sampleSources);                   
      }

    }
    if(l==null&&this.source.isWeb())
    {
      l=this.source.getLemma(lemma);
      if(l!=null)
      {
        s.addLemma(l);
        this.WriteSuperLemma("./data/"+this.getName()+"/",s);
      }
    }
    if(l!=null)
    {
      for(BoWModifier mod:this.modifiers)
      {
        mod.modifyBow(l);
      }     
    }
   
    if(l!=null)
    {
      l.trim();
      Util.writeObject(f, l);
    }
    return l;
  }
  /**
   * @return A String containing the following: "super.toString()+":"+this.modifiers.toString()".
   */
  public String toString()
  {
    return super.toString()+":"+this.modifiers.toString();
  }
  /**
   * Returns the corresponding sense of a target SenseId.
   * @param sid The target SenseId.
   * @return The corresponding Sense object.
   */
  public Sense getSense(String sid)throws Exception
  {
    if(!this.isWeb)
    {
      String lemma=sid.split("@")[0];
      Lemma l=this.getLemmaNoModifiers(lemma);
      if(sid.split("@").length>1)
      {
        int senseNumber=Integer.parseInt(sid.split("@")[1]);
        return l.getSenses().get(senseNumber);
      }
      else
        return null;
    }
    else
    {     
      return this.source.getSense(sid);     
    }
  }

 
  @Override
  /**
   * Stores the requested version in this.language; loadCoreData() later
   * forwards it to the source dictionary via source.setVersion(this.language).
   */
  public void setVersion(String version) {
    this.language=version;
  }

  @Override
  public boolean doesLemmaExists(String lemma) throws Exception{
    if(!this.isWeb)
    {
      File f=new File("./data/lemmas/"+this.getCompleteName().replace(">", "@@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".glm");
      return f.exists();
    }
    else
    {
      return this.getLemma(lemma)!=null;
    }
  }

}
// Related classes of gannuNLP.dictionaries.DataBroker
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark
// of Sun Microsystems, Inc., owned by ORACLE Inc. Contact coftware#gmail.com.