
Source Code of gannuNLP.corpus.WikiCorpus

package gannuNLP.corpus;

import gannuNLP.data.AmbiguousWord;
import gannuNLP.data.Input;
import gannuNLP.data.Lemma;
import gannuNLP.dictionaries.Dictionary;
import gannuNLP.dictionaries.Wiki;
import gannuUtil.Util;


import java.io.File;
import java.util.ArrayList;


/**
* Class for extracting statistics from Wikipedia.
* @author Francisco Viveros-Jiménez
*
*/
public class WikiCorpus extends Corpus{

 
  /**
   * Instantiates a WikiCorpus for extracting statistics for all the lemmas of the target documents.
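   * <p>A minimal usage sketch (the connector setup and document loading below
   * are assumptions for illustration, not part of this class):</p>
   * <pre>{@code
   * Wiki wiki = new Wiki();                      // assumed no-arg construction of the Wiki connector
   * ArrayList<Input> targets = loadDocuments();  // hypothetical helper returning parsed documents
   * WikiCorpus corpus = new WikiCorpus(targets, "wiki-stats", wiki);
   * }</pre>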
   * @param targets List containing the target documents.
   * @param name The name of this corpus.
   * @param dict Base dictionary. It must be an instance of a Wiki connector.
   * @throws Exception
   */
  public WikiCorpus(ArrayList<Input> targets, String name, Dictionary dict) throws Exception
  {   
    super(dict,name);
    this.dict=dict;
    File tmp=new File("./data/matrices/"+dict.getCompleteName().replace(">", "@@@@@")+"/");
    tmp.mkdirs();
    Wiki wiki=(Wiki)this.dict;
    for(Input doc:targets)
    {
      System.out.println("Calculating matrices for "+doc.toString());
      for(AmbiguousWord word:doc.getAmbiguousWords())
      {       
        if(word.getCurrentLemmaObject()!=null&&word.getSenses().size()>0&&word.getSenses().size()<150)
        {
          System.out.print(".");
          File fout=new File("./data/matrices/"+dict.getCompleteName().replace(">", "@@@@@")+"/"+Dictionary.normalizeLemmaforFile(word.getLemma())+".gmx");
          if(!fout.exists())
          {           
            // Build the sense-by-sense count matrix for this lemma.
            SmallPair p=new SmallPair(word.getLemmaObject());
            // First pass: fill the diagonal with each sense's frequency and
            // handle infrequent senses without extra Wikipedia queries.
            for(int i=0;i<p.getLemma().getSenses().size();i++)
            {
              p.getCounts()[i][i]=(int)wiki.getCounts(p.getLemma().getSenses().get(i),p.getLemma().getSenses().get(i));
              for(int j=0;j<p.getLemma().getSenses().size();j++)
              {
                if(p.getCounts()[i][i]<(2*p.getLemma().getSenses().size()))
                {
                  // Sense i is infrequent: copy its (smoothed) frequency to
                  // every pair involving it.
                  if(p.getCounts()[i][i]==0)
                  {
                    p.getCounts()[i][j]=1;
                    p.getCounts()[j][i]=1;
                  }
                  else
                  {
                    p.getCounts()[i][j]=p.getCounts()[i][i];
                    p.getCounts()[j][i]=p.getCounts()[i][i];
                  }
                }
                else
                {
                  // Sense i is frequent: clear the upper triangle so the
                  // second pass below can fill in real co-occurrence counts.
                  if(j>i)
                  {
                    p.getCounts()[i][j]=0;
                  }
                }
              }
            }
         
            // Second pass: query Wikipedia for pairwise co-occurrence counts,
            // but only between pairs of frequent senses.
            for(int i=0;i<p.getLemma().getSenses().size();i++)
            {
              for(int j=i+1;j<p.getLemma().getSenses().size();j++)
              {
                if(p.getCounts()[i][i]>=(2*p.getLemma().getSenses().size())&&p.getCounts()[j][j]>=(2*p.getLemma().getSenses().size()))
                {
                  p.getCounts()[i][j]=(int)wiki.getCounts(p.getLemma().getSenses().get(i),p.getLemma().getSenses().get(j));
                  p.getCounts()[j][i]=p.getCounts()[i][j];
                  if(p.getCounts()[j][i]>=(2*p.getLemma().getSenses().size()))
                  {
                    // A strongly co-occurring pair was found: stop querying.
                    i=p.getLemma().getSenses().size();
                    break;
                  }
                }
              }
            }
            Util.writeObject(fout, p);
          }
        }
      }
      System.out.println();
    }
   
  }
  /**
   * Returns the sense number of the sense having the highest frequency in the Wikipedia Dictionary.
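   * <p>A sketch of a typical call (the lemma "bank" and the corpus construction
   * are assumed for illustration):</p>
   * <pre>{@code
   * WikiCorpus corpus = new WikiCorpus(targets, "wiki-stats", wiki);
   * int senseNumber = corpus.getMFS("bank"); // index into the lemma's sense list
   * }</pre>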
   * @param lemma The target lemma.
   * @return The sense number of the most frequent sense in the Wikipedia Dictionary.
   * @throws Exception
   */
  public int getMFS(String lemma) throws Exception {
    File fout=new File("./data/matrices/"+dict.getCompleteName().replace(">", "@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".gmx");
    // Stored matrices may be a legacy Pair or a SmallPair (see isOneSensePerDiscourse).
    Object o=Util.loadObject(fout);
    SmallPair p;
    if(o instanceof Pair)
    {
      p=new SmallPair((Pair)o);
    }
    else
    {
      p=(SmallPair)o;
    }
    int sel=0;
    for(int i=1;i<p.getLemma().getSenses().size();i++)
    {
      if(p.getCounts()[sel][sel]<p.getCounts()[i][i])
      {
        sel=i;
      }
    }
    return sel;
  }
  /**
   * If you want to use Wikipedia as a corpus, you should look into Wiki.createInputFromArticle.
   * This method does nothing. It exists only to prevent accidental calls.
   * @param document The target document.
   * @throws Exception
   */
  public void transformBoW(Input document) throws Exception
  {
  }
  /**
   * Tells whether the given lemma follows the one-sense-per-discourse rule
   * in the Wikipedia Dictionary.
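   * <p>A sketch of a typical call (the lemma is chosen for illustration):</p>
   * <pre>{@code
   * if (corpus.isOneSensePerDiscourse("bank")) {
   *   // safe to tag every occurrence of "bank" in a document with one sense
   * }
   * }</pre>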
   * @param lemma The target lemma.
   * @return True if the given lemma follows the rule in this corpus.
   * @throws Exception
   */
  public boolean isOneSensePerDiscourse(String lemma) throws Exception
  {
    File fout=new File("./data/matrices/"+dict.getCompleteName().replace(">", "@@@@@")+"/"+Dictionary.normalizeLemmaforFile(lemma)+".gmx");
    if(fout.exists())
    {
      Object o=Util.loadObject(fout);
      SmallPair p;
      if(o instanceof Pair)
      {
        p=new SmallPair((Pair)o);
      }
      else
      {
        p=(SmallPair) o;
      }
      Lemma l=p.getLemma();
      for(int i=0;i<l.getSenses().size();i++)
      {
        for(int j=i+1;j<l.getSenses().size();j++)
        {
          if(p.getCounts()[j][i]>=(2*p.getLemma().getSenses().size()))       
            return false;
        }
      }       
    }
    return true;
  }

  /**
   * Returns the corpus name.
   * @return The name of this corpus.
   */
  public String getName()
  {
    return this.name;
  }
  /**
   *
   * @return The dictionary used for tagging this corpus.
   */
  public Dictionary getDict() {
    return dict;
  }
 

  /**
   *
   * If you want to use Wikipedia as a corpus, you should look into Wiki.createInputFromArticle.
   * This method does nothing. It exists only to prevent accidental calls.
   * @param path Path for saving the SuperLemmas.
   * @throws Exception
   */
 
  public void WriteSuperLemmas(String path) throws Exception
  {
  }

}