// Package gannuNLP.dictionaries
// Source code of gannuNLP.dictionaries.Dictionary

package gannuNLP.dictionaries;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import gannuNLP.data.Count;
import gannuNLP.data.Lemma;
import gannuNLP.data.Sense;
import gannuNLP.data.SuperLemma;
import gannuUtil.KeyArray;
import gannuUtil.KeyString;
import gannuUtil.Util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;


/**
* Template class for creating dictionary connectors.
* @author Francisco Viveros-Jiménez
*
*/
public abstract class Dictionary implements Serializable{
  /**
   * Unique instance of the POS tagger shared by every Dictionary instance.
   * It should be initialized using the setTagger method.
   */
  static MaxentTagger tagger;
  /**
   * Loads the corresponding Stanford POS tagger for this dictionary.
   * The tagger is loaded lazily and cached in the static field, so repeated
   * calls are cheap. Only English is wired up at the moment.
   * @throws Exception if the tagger model file cannot be loaded.
   */
  void setTagger() throws Exception
  {
    if(Dictionary.tagger==null)
    {
      if(this.language.equals("en"))
        Dictionary.tagger=new MaxentTagger("./data/taggermodels/"+this.language+"/english-left3words-distsim.tagger");
      if(this.language.equals("es"))
      {
        //TODO Spanish tagger model is not wired up yet
      }
       
    }   
  }
  /**
   * Flag for telling if the dictionary uses POSTag.
   */
  boolean usesPOSTag;
  /**
   * Tells whether this dictionary distinguishes lemmas by POS tag.
   * @return this.usesPOSTag
   */
  public boolean itUsesPOSTag()
  {
    return this.usesPOSTag;
  }
  /**
   * String for identifying the language. Please use the exact same code as Wikipedia.
   * E.g. "en" for English or "es" for Spanish.
   */
 
  String language;
  /**
   * Returns the language code of this dictionary.
   * @return this.language
   */
  public String getLanguage()
  {
    return language;
  }
  /**
   * Method for configuring the version/language of the dictionary.
   * @param version The version/language of the dictionary.
   */
  public abstract void setVersion(String version);
  /**
   * Returns this dictionary's name including any modifiers.
   * @return This dictionary's name with modifiers.
   */
  public String getCompleteName()
  {
    return name;
  }
  /**
   * Serialization version for this class.
   */
  private static final long serialVersionUID = 1L;
  /**
   * Modifies the name of this dictionary.
   * @param name The new name of this dictionary.
   */
  public void setName(String name)
  {
    this.name=name;
  }
  /**
   * Returns the name of this dictionary.
   * @return this.name
   */
  public String getName() {
    return name;
  }
  /**
   * Returns the path of the dictionary resources.
   * @return this.path
   */
  public String getPath() {
    return path;
  }
  /**
   * Tells whether the dictionary is a Web dictionary.
   */
  protected boolean isWeb;
  /**
   * Dictionary name.
   */
  protected String name;
  /**
   * The path of the dictionary resources.
   */
  protected String path;
  /**
   * Contains the frequency of each lemma on the loaded samples
   * as (lemma, frequency) pairs.
   */
  protected ArrayList<KeyString> wordCounts;
  /**
   * Contains the mapping between a lemma and its possible senses. Lemmas have
   * the format lemma_P, where P is the POStag (N, V, J, R — note that
   * adjectives are tagged "J" by getPOS(int)). This list is always sorted by the lemma field.
   */
  protected ArrayList<KeyArray> senseMaps;
  /**
   * Contains dictionary senses with lemmatized definitions. Lemmatization was done
   * using methods provided in this class. This list allows searching by a senseId
   * (it is kept sorted so Collections.binarySearch works on it).
   */
  protected ArrayList<Sense> senses;
  /**
   * Contains memory map for irregular morphs. Morphs are stored in the following format:
   * ("irregular_morph","base_morph1[;base_morphN]*"). exceptions is sorted
   * by irregular_morph (see loadEnglishIrregulars).
   */
  protected ArrayList<KeyString> exceptions;

 
  /**
   * Contains the list of prepositions stored in the preposition file. Prepositions
   * are used for detecting some verb collocations in Morphy. prepositions is sorted.
   */
  protected ArrayList<String> prepositions;
  /**
   * Number of loaded definitions/samples.
   */
  protected double glossCount;
  /**
   * Number of words contained in the definitions/samples.
   */
  protected double wordCount;
  /**
   * Maximum word size from collocations stored in the dictionary.
   */
  protected int maxCollocationSize;
  /**
   * Returns the lemma/senses memory mapping.
   * @return this.senseMaps
   */
  public ArrayList<KeyArray> getSynMaps() {
    return this.senseMaps;
  }
  /**
   * Returns the glosses list.
   * @return this.senses
   */
  public ArrayList<Sense> getSenses() {
    return this.senses;
  }
  /**
   * Returns the list with irregular morphs.
   * @return this.exceptions
   */
  public ArrayList<KeyString> getExceptions() {
    return this.exceptions;
  }
  /**
   * Tells us if the dictionary is loaded from the Web.
   * @return True if the dictionary is loaded from the Web.
   */
  public boolean isWeb()
  {
    return this.isWeb;
  }
 
  /**
   * Returns a list with the prepositions.
   * @return this.prepositions
   */
  public ArrayList<String> getPrepositions() {
    return this.prepositions;
  }
  /**
   * Returns the count of the senses in the dictionary.
   * @return this.glossCount
   */
  public double getGlossCount() {
    return glossCount;
  }
  /**
   * Returns the maximum word size from collocations stored in the dictionary.
   * @return this.maxCollocationSize
   */
  public int getMaxCollocationSize() {
    return maxCollocationSize;
  }
  /**
   * Retrieve a sense by its synsetId using binary search over glosses mapping.
   * @param sid The synsetId in format "Number_P".
   * @return The corresponding sense object.
   * @throws Exception
   */
  public Sense getSense(String sid) throws Exception
  {
    return senses.get(Collections.binarySearch(senses, sid));
  }
  /**
   * Retrieve IDF for a lemma. Lemmas are in the format "lemma_P".
   * IDF is calculated by using each Sense as a document.
   * Note that even by loading <a href="http://www.cse.unt.edu/~rada/downloads.html">SemCor</a>" the samples will be added
   * to a single Sense object. This means that samples are attached to its
   * corresponding Sense.
   * @param lemma The lemma to look for.
   * @return A double with the calculated IDF value
   * @throws Exception
   */
  public double getIDF(String lemma) throws Exception
  {
    int i=Collections.binarySearch(wordCounts,lemma);
    if(i>=0)
      return Math.log((double)glossCount/(1.0+Integer.parseInt(wordCounts.get(i).getString())));
    else
      return Math.log((double)glossCount);
  }
  /**
   * loadCoreData exists for allowing to parse new <a href="http://www.cse.unt.edu/~rada/downloads.html">SemCor</a> files.
   * loadCoreData loads sense information, mappings and relations.
   * However, it does not load glosses and samples.
   * After executing loadCoreData you can execute the method parseSamplesFromSemCor(String) for adding a new sample corpus.
   * @throws Exception
   */
  public abstract void loadCoreData()throws Exception;
  /**
   * Reads files in the path folder and creates
   * the memory mapping for all the terms in the dictionary.
   * This method loads the samples for the bag of words.
   * Each dictionary has 2 different sample sources:
   * (1)Glosses (dictionary definitions),
   * (2)Samples (dictionary samples).
   * In addition, there exists a corresponding corpus for some dictionaries (like SemCor for WordNet 3.0).
   * Corpora are classified by dictionary in the Resources folder.
   * You can load your own corpus if you parse it first with the method parseSamplesFromSemCor(String).
   * Samples should be in valid SemCor format.
   * @param sampleSources A string with the sources that will form the bag of words.
   * Some valid examples are: "Glosses", "Glosses;Samples", "Glosses;Samples;SemCor;yoursamplesource"
   * @throws Exception
   */
  public abstract void load(String sampleSources)throws Exception;
  /**
   * Loads a count file.
   * A count file contains how many times a word appears in the sample source.
   * Each line has the format "word|count".
   * @param input Dictionary/corpus to be loaded.
   * @throws Exception
   */
  public void loadCountsFromFile(FileReader input) throws Exception{
    BufferedReader in=new BufferedReader(input);
    String line=in.readLine();
    while(line!=null)
    {
      // Each line has the format "lemma|count".
      String tokens[]=line.split("\\|");
      // NOTE(review): this uses a linear indexOf lookup while getIDF/getCounts
      // use Collections.binarySearch on wordCounts — presumably the list gets
      // sorted elsewhere after loading; verify before relying on its ordering.
      int index=wordCounts.indexOf(new KeyString(tokens[0]));
      if(index>=0)
      {
        // Lemma already present: accumulate the counts.
        KeyString ks=wordCounts.get(index);
        ks.setString(String.valueOf(Integer.parseInt(ks.getString())+Integer.parseInt(tokens[1])));
      }
      else
        wordCounts.add(new KeyString(tokens[0],tokens[1]));
      line=in.readLine();
    }
    in.close();
   
  }
  /**
   * This method parses the samples and counts for a valid SemCor format source.
   * After using this method you can load the samples from it by loading the source name.
   * @param source The name of the file or the folder that contains SemCor valid format files.
   * An error will be raised if a no SemCor file is mixed in the source folder.
   * @throws Exception
   */
  public void parseSamplesFromSemCor(String source) throws Exception {
   
    File fs=new File(source);
    ArrayList<File> files=Util.getAllFiles(fs);
    ArrayList<KeyString> counts=new ArrayList<KeyString>();
    FileWriter writers=new FileWriter(path+"samples/"+fs.getName());
    FileWriter writerc=new FileWriter(path+"counts/"+fs.getName());
    BufferedWriter outs=new BufferedWriter(writers);
    BufferedWriter outc=new BufferedWriter(writerc);
    for(File file:files)
    {
      DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = fact.newDocumentBuilder();
      Document test=builder.parse(file);
      NodeList sentences=test.getElementsByTagName("s");
     
      //Extracting lemmas from each sentence
      for(int i=0;i<sentences.getLength();i++)
      {
        System.out.println("Parsing "+ file.getName() +" sentence "+(i+1));
        Element sentence=(Element)sentences.item(i);
        String sample=" ";
        ArrayList<String> lemmas=new ArrayList<String>(20);
        ArrayList<String> sids=new ArrayList<String>(20);
       
        NodeList words=sentence.getElementsByTagName("wf");
        for(int j=0;j<words.getLength();j++)
        {
          Element word=(Element)words.item(j);
          sample+=word.getChildNodes().item(0).getNodeValue()+" ";
          if(word.getAttribute("cmd").equals("done")&&word.getAttribute("lemma").length()>0)
          {
            String pos=word.getAttribute("pos").substring(0,1).toUpperCase();
            if(pos.equals("W"))
              pos="R";
            lemmas.add(word.getAttribute("lemma")+"_"+pos);
            ArrayList<Sense> senses=getSenses(word.getAttribute("lemma")+"_"+pos);
            if(senses.size()>0)
            {
              String []correct=word.getAttribute("wnsn").split(";");
              for(String c:correct)
              {
                if(c.equals("U"))
                {
                  for(Sense sense:senses)
                    sids.add(sense.getSid());
                }
                else
                {
                  int index=Integer.parseInt(c)-1;
                  if(index<senses.size()&&index>=0)
                  {
                    sids.add(senses.get(index).getSid());
                  }
                }
              }
            }
          }
        }
       
        sample=sample.substring(0, sample.length()-1);
        String outline="";
        for(String sid:sids)
        {
          outline=sid+"|"+sample+"|";
          for(String lemma:lemmas)
          {
            outline+=lemma+" ";
          }
          outline=outline.substring(0, outline.length()-1);
          outs.write(outline+"\n");
        }
        for(String lemma:lemmas)
        {
          KeyString ks=new KeyString(lemma,"1");
          int index=counts.indexOf(ks);
          if(index>=0)
          {
            ks=counts.get(index);
            ks.setString(String.valueOf(Integer.parseInt(ks.getString())+1));
          }
          else
          {
            counts.add(ks);
          }
        }
      }
    }
    for(KeyString ks:counts)
    {
      outc.write(ks.getKey()+"|"+ks.getString()+"\n")
    }
    outs.close();
    outc.close();
    writers.close();
    writerc.close();
  }
 
  /**
   * Utility method for parsing dictionary glosses and samples.
   * Try to avoid its usage, unless you modify the dictionary or
   * erase the files in the samples and counts resource folders.
   * @throws Exception
   */
  public abstract void parseSamplesFromDictionary() throws Exception;
  /**
   * Loads the samples from a parsed source.
   * Each line has the format "senseId|sample|lemma1 lemma2 ..."
   * (as written by parseSamplesFromSemCor).
   * @param input The loaded source.
   * @throws Exception
   */
  public void loadSamplesFromSource(FileReader input) throws Exception
  {
    BufferedReader in=new BufferedReader(input);
    String line=in.readLine();
    while(line!=null)
    {
      String tokens[]=line.split("\\|");
      Sense ps=getSense(tokens[0]);
      if(tokens.length>2)
      {
        // Attach the sample and its lemmas to the sense's bag of words.
        ps.addBagOfWords(tokens[1], tokens[2].split(" "),this.name);
        this.wordCount+=(double)(tokens[2].split(" ").length);
      }
      else
        ps.addBagOfWords(tokens[1], new String[]{""},this.name);
      line=in.readLine();
    }
    in.close();
  }
  /**
   * This method its similar to wn command of WordNet. getLemma uses a lemma in
   * the format "lemma_P". First, base forms are retrieved with Morphy. Then,
   * senses are retrieved for the corresponding base forms.
   * @param lemma The lemma to look for.
   * @return An ArrayList with the senses for the lemma. An empty ArrayList will be
   * returned if the lemma wasn't found.
   * @throws Exception
   */
  public ArrayList<Sense> getSenses(String lemma) throws Exception
  {
    String postag=lemma.substring(lemma.length()-1);
    if(postag.equals("W"))
      postag="R";
    int pos=getPOS(postag);
    postag="_"+postag;
    ArrayList<String>lemmas=Morphy(lemma.substring(0,lemma.length()-2), pos);
    ArrayList<Sense> senses=new ArrayList<Sense>(4);
    if(pos>=0)
    {
     
      for(String lemmata:lemmas)
      {
        try
        {
          ArrayList<String> sids=senseMaps.get(Collections.binarySearch(senseMaps,lemmata+postag)).getArray();
          for(String sid:sids)
          {
           senses.add(this.senses.get(Collections.binarySearch(this.senses, sid)));
          }
        }
        catch(Exception e)
        {
         
        }
      }
     
    }
    //if(senses.size()==0)
    //  System.out.println(lemma+" not found");
    return senses;
  }
  /**
   * Implementation of a lexical processor.
   * @param morph The word to be processed.
   * @param postag The POS tag of the word ("N","V","A","R").
   * @return A list with the possible corresponding lemmas found in dictionary.
   * @throws Exception
   */
  public ArrayList<String> Morphy(String morph) throws Exception
  {
    if(this.usesPOSTag)
    {
      return Morphy(morph.substring(0,morph.length()-2),morph.substring(morph.length()-1));
    }
    else
    {
      return Morphy(morph,"");
    }
   
  }
  /**
   * Implementation of a lexical processor.
   * @param morph The word to be processed.
   * @param postag The POS tag of the word ("N","V","A","R").
   * @return A list with the possible corresponding lemmas found in dictionary.
   * @throws Exception
   */
  public ArrayList<String> Morphy(String morph, String postag) throws Exception
  {
    return Morphy(morph,getPOS(postag));
  }
  /**
   * Implementation of a lexical processor.
   * @param morph The word to be processed.
   * @param pos The POS tag of the word ("N=0","V=1","A=2","R=3").
   * @return A list with the possible corresponding lemmas found in dictionary.
   * @throws Exception
   */
  public ArrayList<String> Morphy(String morph, int pos) throws Exception
  {
    ArrayList<String> lemmas=new ArrayList<String>(5);
    String postag;
    if(this.usesPOSTag)
      postag="_"+getPOS(pos);
    else
      postag="";
    if(pos>=0||!this.usesPOSTag)
    {
      //transform the morph with simple transformation rules
      ArrayList<String> lemmatas=Transform(morph,pos);
      lemmatas.addAll(Transform(morph.toLowerCase(),pos));
      for(String lemmata:lemmatas)
      {
        if(this.doesLemmaExists(lemmata+postag))
        {
          if(!lemmas.contains(lemmata))
            lemmas.add(lemmata);
        }
     
      }
     
    }
    return lemmas;
  }
  /**
   * Method that leads to the execution of the proper morphological unit generator.
   * Currently, is available for the English language.
   * @param morph The word to be processed.
   * @param pos The POS tag of the word ("N=0","V=1","A=2","R=3").
   * @return A list with the possible base forms for the word.
   * The List could contain duplicates and invalid words.
   * @throws Exception
   */
  public ArrayList<String> Transform(String morph, int pos) throws Exception
  {
    if(this.language.equals("en"))
    {
      return this.EnglishMorphy(morph, pos);
    }
    return new ArrayList<String>();   
  }

  /**
   * Morphological unit generator for English language.
   * @param morph The word to be processed.
   * @param pos The POS tag of the word ("N=0","V=1","A=2","R=3").
   * @return A list with the possible base forms for the word.
   * The List could contain duplicates and invalid words.
   * @throws Exception
   */
private ArrayList<String> EnglishMorphy(String morph, int pos) throws Exception{
  ArrayList<String> lemmas=new ArrayList<String>(8);
  //include the morph as it
  lemmas.add(morph);
  switch(pos)
  {
    case 0:
      if(morph.contains("_"))
      {
        String split[]=morph.split("_");
        if(split.length==2)
        {
          ArrayList<String> aux=Morphy(split[0], 0);
          ArrayList<String> aux2=Morphy(split[1], 0);
          for(String x:aux)
            for(String y:aux2)
              lemmas.add(x+"_"+y);
          aux=Morphy(split[0], 0);
          aux2=Morphy(split[1], 1);
          for(String x:aux)
            for(String y:aux2)
              lemmas.add(x+"_"+y);
          aux=Morphy(split[0], 2);
          aux2=Morphy(split[1], 0);
          for(String x:aux)
            for(String y:aux2)
              lemmas.add(x+"_"+y);
        }
      }
      if(morph.endsWith("ful"))
      {
        ArrayList<String> aux=Morphy(morph.substring(0,morph.length()-3), pos);
        for(String x:aux)
          lemmas.add(x+"ful");
      }
      if(morph.endsWith("ses")||morph.endsWith("xes")||morph.endsWith("zes")||morph.endsWith("ches")||morph.endsWith("shes"))
      {
        lemmas.add(morph.substring(0, morph.length()-2));
      }
      if(morph.endsWith("ies"))
      {
          lemmas.add(morph.substring(0, morph.length()-3)+"y");
      }
      if(morph.endsWith("s"))
      {
        lemmas.add(morph.substring(0, morph.length()-1));
     
      if(morph.endsWith("men"))
      {
        lemmas.add(morph.substring(0, morph.length()-2)+"an");
      }
     
      break;
    case 1:
      if(morph.contains("_"))
      {
        if(hasPrepositions(morph))
        {
          String split[]=morph.split("_");
          ArrayList<String> aux=Morphy(split[0], 1);
          ArrayList<String> aux2=Morphy(split[split.length-1], 0);
          if(aux.size()==0)
            aux.add(split[0]);
          if(aux2.size()==0)
            aux2.add(split[split.length-1]);
          for(String x:aux)
            for(String y:aux2)
            {
              String t=x;
              for(int i=1;i<split.length-1;i++)
                t+="_"+split[i];
              t+="_"+y;
              lemmas.add(t);
            }
        }
        else
        {
          String split[]=morph.split("_");
          if(split.length==2)
          {
            ArrayList<String> aux=Morphy(split[0], 1);
            ArrayList<String> aux2=Morphy(split[1], 0);
            for(String x:aux)
              for(String y:aux2)
                lemmas.add(x+"_"+y);
          }
        }
      }
      if(morph.endsWith("es")||morph.endsWith("ed"))
      {
        lemmas.add(morph.substring(0,morph.length()-1));
        lemmas.add(morph.substring(0,morph.length()-2));
      }
      if(morph.endsWith("ies"))
      {
        lemmas.add(morph.substring(0, morph.length()-3)+"y");
      }
      if(morph.endsWith("s"))
      {
        lemmas.add(morph.substring(0, morph.length()-1));
     
      if(morph.endsWith("ing"))
      {
        lemmas.add(morph.substring(0,morph.length()-3)+"e");
        lemmas.add(morph.substring(0,morph.length()-3));
      }
      break;
    case 2:
      if(morph.endsWith("er"))
      {
        lemmas.add(morph.substring(0,morph.length()-1));
        lemmas.add(morph.substring(0,morph.length()-2));
      }
      if(morph.endsWith("est"))
      {
        lemmas.add(morph.substring(0,morph.length()-2));
        lemmas.add(morph.substring(0,morph.length()-3));
      }
      break;
   
  }
  this.loadIrregulars();
  int i=Collections.binarySearch(exceptions, morph+"_"+Dictionary.getPOS(pos));   
  if(i>-1)
  {
    String lemmatas2[]=exceptions.get(i).getString().split(";");
    for(String lemmata:lemmatas2)
      if(!lemmas.contains(lemmata))
        lemmas.add(lemmata);
  }
  return lemmas;
}
void loadIrregulars() throws Exception{
  if(this.exceptions==null)
  {
    if(this.language.equals("en"))
    {
      this.loadEnglishIrregulars();
    }
    if(this.language.equals("es"))
    {
      //TODO
    }
  } 
}

void loadEnglishIrregulars()throws Exception
{
  //Loading exception list
  System.out.println("Loading exception list")
  exceptions=new ArrayList<KeyString>(5960);
  ArrayList<FileReader> exc=new ArrayList<FileReader>(4);
  exc.add(new FileReader("data/exceptions/en/noun.exc"));
  exc.add(new FileReader("data/exceptions/en/verb.exc"));
  exc.add(new FileReader("data/exceptions/en/adj.exc"));
  exc.add(new FileReader("data/exceptions/en/adv.exc"));
  String line;
  int i=0;
  String tokens[];
  int j;
  for(FileReader inf:exc)
  {
    String pos="_"+getPOS(i);
    BufferedReader in=new BufferedReader(inf);
    line=in.readLine();
    while(!(line==null))
    {
      tokens=line.split(" ");
      for(j=2;j<tokens.length;j++)
        tokens[1]+=";"+tokens[j];
      exceptions.add(new KeyString(tokens[0]+pos,tokens[1]))
      line=in.readLine();
    }
    in.close();
    inf.close();
    i++;
  }
  Collections.sort(exceptions);
  exc.clear();
}
/**
* Method for detecting if a collocation has a preposition in it.
* @param morph The collocation to process. White spaces must be replaced with "_".
* @return true if the collocation has a preposition in it.
*/

  public boolean hasPrepositions(String morph) {
    String split[]=morph.split("_");
    for(int i=1;i<split.length-1;i++)
      if(prepositions.contains(split[i]))
        return true;
    return false;
  }

  /**
   * Returns the corresponding POS tag
   * @param pos  The POS tag of the word ("N=0","V=1","A=2","R=3").
   * @return The POS tag. An empty string if pos was not a valid number
   */
  public static String getPOS(int pos) {
    String postag="";
    switch(pos)
    {
      case 0:
        postag+="N";
        break;
      case 1:
        postag+="V";
        break;
      case 2:
        postag+="J";
        break;
      case 3:
        postag+="R";
        break;
    }
    return postag;
  }
  /**
   * Lemmatizer that uses Morphy as morphological processor
   * and Stanford's Part-Of-Speech Tagger.
   * @param line The text to be processed.
   * @return An ArrayList that contains an ArrayList of the possible lemmas of each word.
   * Most of the words have one corresponding lemma in WordNet, but, there are some exceptions. For
   *  example: axes_N-> (axis_N, ax_N, axe_N).
   * @throws Exception
   */
  public ArrayList<ArrayList<String>> lemmatize(String line) throws Exception
 
    this.setTagger();
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(line));   
    ArrayList<TaggedWord> tSentence= new ArrayList<TaggedWord>();
      for (List<HasWord> sentence : sentences) {
         tSentence.addAll(Dictionary.tagger.tagSentence(sentence));
      }
      ArrayList<ArrayList<String>> aux=new ArrayList<ArrayList<String>>(tSentence.size());
      ArrayList<String> temp;
      ArrayList<String> lemmas;
      String comp,pos;
      int i;
      //Morphy eater
      while(tSentence.size()>0)
      {
        comp="";
        pos=tSentence.get(0).tag().substring(0, 1);
        //try to eat the most tokens as possible
       
        for(i=tSentence.size()>maxCollocationSize?maxCollocationSize:tSentence.size();i>0;i--)
        {
            comp=tSentence.get(0).word().toLowerCase();
            for(int j=1;j<i;j++)
            {
              comp+="_"+tSentence.get(j).word().toLowerCase();
            }
            temp=Morphy(comp,pos);
            if(temp.size()==0 && i>1)//try for the last element value
            {
              temp=Morphy(comp,tSentence.get(i-1).tag().substring(0, 1));
              if(temp.size()>0)
                pos=tSentence.get(i-1).tag().substring(0, 1);
            }
           
             //temp must contain something for eat something
            if(temp.size()>0)
            {
              lemmas= new ArrayList<String>();
                for(String lemma:temp)
                  if(!lemmas.contains(lemma+"_"+pos))
                    lemmas.add(lemma+"_"+pos);               
                aux.add(lemmas);
                temp=null;
                break;
            }   
        }
        //eat i tokens
        if(i==0)
          i++;
        while(i>0)
        {
          tSentence.remove(0);
          i--;
        }
      }
    return aux;
  }
  /**
   * Open-class words extracted with the Stanford Part-Of-Speech Tagger.
   * This lemmatizer returns an unlemmatized list of tagged words; only
   * words whose tag letter maps to a valid POS (N, V, J, R/W) are kept.
   * @param line The text to be processed.
   * @return An ArrayList that contains the tagged words, one single-element list per word.
   * @throws Exception
   */
  public ArrayList<ArrayList<String>> POSTagging(String line) throws Exception
  {
    this.setTagger();
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(line));
    ArrayList<TaggedWord> tSentence= new ArrayList<TaggedWord>();
      for (List<HasWord> sentence : sentences) {
         tSentence.addAll(Dictionary.tagger.tagSentence(sentence));
      }
      ArrayList<ArrayList<String>> aux=new ArrayList<ArrayList<String>>(tSentence.size());
      String pos;
      for(TaggedWord word:tSentence)
      {
        pos=word.tag().substring(0, 1);
        // getPOS returns -1 for closed-class tags, which are skipped.
        if(getPOS(pos)>=0)
        {
          ArrayList<String> x=new ArrayList<String>(1);
          x.add(word.word()+"_"+pos);
          aux.add(x);
        }
      }
    return aux;
  }
  /**
   * Returns the corresponding POS tag
   * @param pos  The POS tag of the word ("N=0","V=1","A=2","R=3","W=3").
   * @return The POS tag. -1 if pos its an invalid tag.
   */
  public static int getPOS(String pos) {
    int postag=-1;
    if(pos.equals("N"))
      postag=0;
    if(pos.equals("V"))
      postag=1;
    if(pos.equals("J"))
      postag=2;
    if(pos.equals("R")||pos.equals("W"))
      postag=3;
    return postag;
  }
  /**
   * Sets the path/URL of the source dictionary.
   * @param path The new resource path/URL.
   */
  public void setPath(String path)
  {
    this.path=path;
  }
  /**
   * Returns this dictionary's name.
   */
  @Override
  public String toString()
  {
    return name;
  }
  /**
   * Write all the loaded data into SuperLemma files.
   * SuperLemma files act as cache files for all the dictionaries.
   */
  public void WriteSuperLemmas(String path)throws Exception
 
    int i=0;
    FileWriter f=new FileWriter(path+this.name+".sta");
    BufferedWriter out=new BufferedWriter(f);
    out.write(String.valueOf(this.glossCount)+"\n");
    out.write(String.valueOf(this.wordCount)+"\n");
    out.close();
    f.close();
    for(KeyArray Key:this.senseMaps)
    {
      String lemma=Key.getKey();
      ArrayList<Sense> senses=this.getSenses(lemma);
      ArrayList<Count> counts=this.getCounts(lemma);
      SuperLemma s=this.loadSuperLemma(lemma,path);
      Lemma lemmaO=new Lemma(lemma,senses.get(0).getPos(),senses,counts,this.name);
      s.addLemma(lemmaO);
      this.WriteSuperLemma(path,s);
      i++;
      if(i%1000==0)
      {
        System.out.println("Saving slm file "+String.valueOf(i)+"/"+String.valueOf(this.senseMaps.size()));
      }
    }
  }
  /**
   * Returns the frequency of a word in the dictionary.
   * @param lemma The target word.
   * @return The frequency is stored inside an ArrayList<Count> of size=1.
   */
  public ArrayList<Count> getCounts(String lemma) {
    int i=Collections.binarySearch(wordCounts,lemma);
    ArrayList<Count> counts=new ArrayList<Count>();
    Count c;
    if(i>=0)
      c=new Count(Double.parseDouble(wordCounts.get(i).getString()),this.name);
    else
      c=new Count(0.0,this.name);
    counts.add(c);
    return counts;
  }
  /**
   * Loads a super lemma from an slm file in the data folder.
   * @param lemma The super lemma to be loaded.
   * @return The super lemma loaded from the file or a new super lemma.
   */
  public static String normalizeLemmaforFile(String lemma)
  {
    return lemma.replace("/", "+++").replace("?", "@@@").replace("\"", "@@@@").replace(":","@@@@@").replace(">", "@@@@@@").replace("*","xxxxxx");
  }
  /**
   * Loads a SuperLemma from a file.
   * @param lemma The target lemma.
   * @param path The path containing this superlemma.
   * @return The corresponding SuperLemma if any. Otherwise, it returns a new empty SuperLemma.
   * @throws Exception
   */
  public SuperLemma loadSuperLemma(String lemma,String path) throws Exception{
    // File name is the normalized lemma plus the ".slm" extension.
    File ft=new File(path+Dictionary.normalizeLemmaforFile(lemma)+".slm");
    SuperLemma s;
    if(ft.exists())
    {
      s=(SuperLemma)Util.loadObject(ft);
    }
    else
    {
      s=new SuperLemma(lemma);
    }
    return s;
  }
  /**
   * Writes a file containing the target super lemma.
   * The file name is the normalized lemma with the ".slm" extension.
   * @param path Target folder for the file.
   * @param superlemma The target super lemma.
   * @throws Exception if the file cannot be written.
   */
  public void WriteSuperLemma(String path, SuperLemma superlemma) throws Exception{
    Util.writeObject(new File(path+Dictionary.normalizeLemmaforFile(superlemma.getLemma())+".slm"), superlemma);
  }
  /**
   * Returns the number of words in the definitions/samples.
   * @return The number of words in the definitions/samples.
   */
  public double getWordCount()
  {
    return this.wordCount;
 
  /**
   * Lemmatizes a full text returning lemmatized paragraphs. It automatically
   * does sentence splitting and lemmatization.
   * Lemmas not found in the dictionary are keeped in its base form.
   * The prefix "###" is added for identifying unlemmatized open-class words.
   * Powered by Stanford POS tagger.
   * @param text Target text.
   * @param chunks Empty ArrayList for storing the original sentence in chunks having the same structure as the return value.
   * @return An ArrayList having paragraphs as elements. Each paragraph (Element) contains
   * a set of sentences. Each sentence contains a set of words.
   */
  public ArrayList<ArrayList<ArrayList<String>>> lemmatize(ArrayList<String> text,ArrayList<ArrayList<ArrayList<String>>> chunks) throws Exception{
    ArrayList<ArrayList<ArrayList<String>>> ptext=new ArrayList<ArrayList<ArrayList<String>>>(text.size());     
    Lemma l=null;
    this.setTagger();
    for(String p:text)
    {
      List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(p));
      ArrayList<ArrayList<String>> paragraph=new ArrayList<ArrayList<String>>(sentences.size());
      ArrayList<ArrayList<String>> cparagraph=new ArrayList<ArrayList<String>>(sentences.size());
      ArrayList<TaggedWord> tSentence= new ArrayList<TaggedWord>();
      for(List<HasWord> sentence:sentences)
      {
        tSentence.clear();       
        tSentence.addAll(Dictionary.tagger.tagSentence(sentence));
        ArrayList<String> psentence=new ArrayList<String>(tSentence.size());
        ArrayList<String> csentence=new ArrayList<String>(tSentence.size());
        String lemmata,lemma,pos;
        for(TaggedWord word:tSentence)
        {
          pos=word.tag().substring(0,1).toUpperCase();
          lemma=word.word();
          csentence.add(word.word());
          if(this.usesPOSTag)
          {
            lemmata=lemma;
            lemma=lemma+"_"+pos;
            if(pos.equals("N")||pos.equals("J")||pos.equals("R")||pos.equals("V"))
            {
              ArrayList<String>lemmatas=this.Morphy(lemmata, pos);
              for(String plemmata:lemmatas)
              {
                l=this.getLemma(plemmata+"_"+pos);
                if(l!=null)
                {
                  lemma=plemmata+"_"+pos;
                  break;
                }
              }           
            }
          }
          else
          {                       
            l=this.getLemma(lemma);           
          }         
          psentence.add(lemma);
          l=null;
        }
        paragraph.add(psentence)
        cparagraph.add(csentence);
      }
      ptext.add(paragraph);
      chunks.add(cparagraph);
    }
    return ptext;
  }

  /**
   * Returns a Lemma object for the corresponding lemma.
   * @param lemma The target lemma.
   * @return The corresponding lemma object or null when the target lemma does not exist in the dictionary.
   * @throws Exception
   */
  public abstract Lemma getLemma(String lemma) throws Exception;
  /**
   *
   * @param lemmaObject
   * @return Corresponding IDF value
   */
  public double getIDF(Lemma lemmaObject) {
    double w=1.0;
    for(Count c:lemmaObject.counts)
    {
      w+=c.getFrequency();
    }   
    return Math.log(this.glossCount/w);
  }
  public static boolean isPunctuation(String pos) {
    return pos.matches("\\W");
  }
  /**
   * Method that tells if a target lemma has entries in this dictionary.
   * @param lemma Target lemma.
   * @return True, if an entry exists corresponding to the target lemma.
   * @throws Exception
   */
  public abstract boolean doesLemmaExists(String lemma) throws Exception;
}
// Scraped-page footer (kept as comments so the file compiles):
// TOP
// Related Classes of gannuNLP.dictionaries.Dictionary
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.