// Package gannuNLP.corpus
//
// Source Code of gannuNLP.corpus.Corpus

package gannuNLP.corpus;

import gannuNLP.data.AmbiguousWord;
import gannuNLP.data.Count;
import gannuNLP.data.Input;
import gannuNLP.data.Lemma;
import gannuNLP.data.Sense;
import gannuNLP.data.SuperLemma;
import gannuNLP.dictionaries.Dictionary;
import gannuUtil.Util;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;


/**
* Class for loading a corpus of SGF files for calculating some useful
* statistics and extending bag of words with new samples.
* @author Francisco Viveros-Jiménez
*
*/
public class Corpus{   
  /**
   * Documents loaded from the corpus (parsed SGF files).
   */
  ArrayList<Input> documents;
 
  /**
   * Source dictionary used for tagging this corpus.
   */
  Dictionary dict;
  /**
   * The name of this corpus (taken from the source folder name).
   */
  String name;
  /**
   * Returns the documents loaded from this corpus.
   * @return documents
   */
  public ArrayList<Input> getDocuments() {
    return documents;
  }

 
  @SuppressWarnings("unchecked")
  /**
   * Loads a corpus of sgf files from a target folder.
   * @param source The target folder containing the target SemCor files.
   * @param dict The base dictionary.
   * @param noTag True if you want to include undisambiguated words.
   * @throws Exception
   */
  public Corpus(String source, Dictionary dict,boolean noTag) throws Exception
  {
    this.dict=dict;
    File s=new File(source);
    this.name=s.getName();
    File d=new File("data/corpora/");
    d.mkdirs();
    File p=new File("./data/corpora/"+this.name+"_"+String.valueOf(noTag)+".gcf");
    if(p.exists())
    {
      System.out.println("Loading "+p.getName()+"...");
      this.documents=(ArrayList<Input>)Util.loadObject(p);       
      for(Input doc:this.documents)
      {
        for(AmbiguousWord Word:doc.getAmbiguousWords())
        {         
          Word.setDict(dict);
        }
      }               
    }
    else
    {   
      this.documents=new ArrayList<Input>();
      int i=1;
      ArrayList<File> files=Util.getAllSGFFiles(s);
     
      for(File f:files)
      {
        System.out.println(String.valueOf(i)+"/"+String.valueOf(files.size()));
        this.documents.add(new Input(f,null,dict,noTag,false));
        i++;
     
      this.documents.trimToSize();       
      System.out.println("Saving "+p.getName()+"...");
      Util.writeObject(p, this.getDocuments());       
    }     
  }
  /**
   * Returns the Sense having the higher corpus frequency.
   * @param lemma.
   * @return The most frequent sense in the corpus.
   * @throws Exception
   */
  public int getMFS(String lemma) throws Exception {
    int maxSenses=100;
    int counts[]=new int[maxSenses];
    int sel=0;
    for(int i=0;i<maxSenses;i++)
    {
      counts[i]=0;
    }
    if(!this.dict.isWeb())
    {     
      for(Input doc:this.documents)
      {
        for(AmbiguousWord word:doc.getAmbiguousWords())
        {
          if(word.getLemma().equals(lemma))
          {
            for(int answer: word.getCorrectSenseNumbers())
            {
              if(answer>=0)
              {
                counts[answer]++;
              }
            }
          }
        }
      }     
      for(int i=1;i<maxSenses;i++)
      {
        if(counts[i]>counts[sel])
        {
          sel=i;
        }
      }     
    }
    else//Search in wiki!!
    {
           
    }
    if(counts[sel]>0)
      return sel;
    else
      return -1;
  }
  /**
   * Add corpus samples into the document' AmbiguousWords
   * @param documents Target document
   * @throws Exception
   */
  public void transformBoW(Input document) throws Exception
  {
    for(ArrayList<AmbiguousWord> sentence:document.getSentences())
    {
      ArrayList<Sense> currentSenses=new ArrayList<Sense>();
      ArrayList<String> sample=new ArrayList<String>();
      String sampleText="";
      for(AmbiguousWord word:sentence)
      {
        sampleText+= word.getLemma()+" ";
        sample.add(word.getLemma());
        for(Sense answer: word.getCorrectSenses())
        {
          if(!currentSenses.contains(answer))
          {
            currentSenses.add(answer);
          }
        }
      }
      for(Sense sense:currentSenses)
      {
        sense.addBagOfWords(sampleText, sample,this.name);
      }
    }
  }
  /**
   * Tells is the given lemma accomplishes one sense per discourse rule in this tagged corpus.
   * @param lemma The given lemma.
   * @return True is the given lemma accomplishes the rule in this corpus.
   * @throws Exception
   */
  public boolean isOneSensePerDiscourse(String lemma) throws Exception
  {   
    int hits=0;
    for(Input doc:this.documents)
    {
      ArrayList<Integer> senses=new ArrayList<Integer>();
      for(AmbiguousWord word:doc.getAmbiguousWords())
      {
        if(lemma.equals(word.getLemma()))
        {
          hits++;
          for(int answer:word.getCorrectSenseNumbers())
          {
            Integer ans=new Integer(answer);
            if(!senses.contains(ans))
              senses.add(ans);
          }
        }
      }
      if(senses.size()>1)
        return false;
    }
    return true;
  }
 

  /**
   * Generates a copy of another corpus object.
   * @param original The original corpus
   */
  public Corpus(Corpus original)
  {
    this.dict=original.getDict();
    this.documents=new ArrayList<Input>(original.getDocuments().size());
    this.documents.addAll(original.getDocuments());
  }
  public Corpus(Dictionary dict,String name) {
    this.dict=dict;
    this.name=name;
  }

  /**
   * Returns the corpus name.
   * @return This corpus name.
   */
  public String getName()
  {
    return this.name;
  }
  /**
   *
   * @return The dictionary used for tagging this corpus.
   */
  public Dictionary getDict() {
    return dict;
  }
 
  /**
   * Method for writing SuperLemmas for using it with the DataBroker.
   * Use this method for extending dictionary definitions with examples.
   * The name of this corpus will be used as future reference for loading the samples with the DataBroker.
   * Two phases: (1) scan every sentence, tallying lemma frequencies and
   * queueing one Update per (lemma, correct sense, sentence); (2) replay the
   * updates sorted by lemma so each SuperLemma file is loaded and written once.
   * @param path The path where the DataBroker files are being stored.
   * @throws Exception
   */
  public void WriteSuperLemmas(String path) throws Exception
  {
    // Parallel lists: counts.get(i) is the frequency counter for lemmas.get(i).
    ArrayList<Count> counts=new ArrayList<Count>();
    ArrayList<String> lemmas=new ArrayList<String>();
    System.out.println("Processing samples!!!");
    int d=1;
    ArrayList<Update> updates=new ArrayList<Update>();
    for(Input document:this.documents)
    {
      System.out.println(String.valueOf(d)+"/"+String.valueOf(this.documents.size()));
      d++;
      int i=0; // sentence index within the current document
      for(ArrayList<AmbiguousWord> sentence:document.getSentences())
      {
        // Build the sentence's bag of words (its lemmas) while tallying lemma frequencies.
        ArrayList<String> bow=new ArrayList<String>(sentence.size());
        for(AmbiguousWord word:sentence)
        {
          int index=lemmas.indexOf(word.getLemma());
          if(index>=0)
          {
            counts.get(index).increaseFrequency();
          }
          else
          {
            lemmas.add(word.getLemma());
            counts.add(new Count(1.0,this.name));
          }
          bow.add(word.getLemma());
        }
        // Queue one update per correct sense tag (negative numbers mean untagged).
        for(AmbiguousWord word:sentence)
        {
          for(int index:word.getCorrectSenseNumbers())
          {
            if(index>=0)
            {
              updates.add(new Update(word.getLemma(),index,document.getTextSentences().get(i), bow));
            }
          }
                   
        }
        i++;
      }
    }
    // Sorting groups updates by lemma so each SuperLemma is loaded/saved exactly once below.
    Collections.sort(updates);
    System.out.println("Saving samples!!!");
    d=1;
    Update p=null; // previously processed update (null on the first iteration)
    SuperLemma s=null; // SuperLemma currently being extended
    Lemma l=null; // entry of s for this dictionary
    for(Update u:updates)
    {
      System.out.println(String.valueOf(d)+"/"+String.valueOf(updates.size()));
      d++;
      if(p==null||!u.getLemma().equals(p.getLemma()))
      {
        // Lemma changed: flush the previous SuperLemma before loading the next one.
        if(s!=null)
        {
          l.addCount(counts.get(lemmas.indexOf(p.getLemma())));
          this.dict.WriteSuperLemma(path, s);
        }
        s=this.dict.loadSuperLemma(u.getLemma(),path);
        l=s.retrieveLemma(this.dict.getName());
      }
      // Add the sample only if this exact sentence text is not already stored.
      Sense sens=l.getSenses().get(u.getSense());
      if(!sens.getSamples().contains(u.getText()))
      {
        sens.addBagOfWords(u.getText(), u.getBow(),this.name);
      }
       p=u;
    }
    // Flush the last pending SuperLemma (the loop only flushes on lemma change).
    if(s!=null)
    {
      l.addCount(counts.get(lemmas.indexOf(p.getLemma())));
      this.dict.WriteSuperLemma(path, s);
    }
  }
}
// TOP
//
// Related Classes of gannuNLP.corpus.Corpus
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.