Package gannuNLP.data

Source Code of gannuNLP.data.Input

package gannuNLP.data;



import gannuNLP.dictionaries.DataBroker;
import gannuNLP.dictionaries.Dictionary;
import gannuWSD.sensefilters.SenseFilter;
import gannuWSD.testing.Decision;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

/**
* Class for loading an sgf file.
* @author Francisco Viveros-Jiménez
*
*/
public class Input implements Serializable{
  /**
   *
   */
  private static final long serialVersionUID = 1L;
  /**
   * An ArrayList containing the document organized in paragraphs, sentences and words.
   */
  ArrayList<ArrayList<ArrayList<AmbiguousWord>>> contain;
  /**
   * An ArrayList containing this document's sentences.
   */
    ArrayList<String> textSentences;
    /**
     * An ArrayList with the AmbiguousWords of this document. 
     */
    ArrayList<AmbiguousWord> ambiguousWords;
    /**
     * A simple index indicating the positions (appearances) of each lemma in the document.
     */
    HashMap<String,ArrayList<Integer>> Index;
    /**
     * This file's name.
     */
    String name;
    /**
     * Returns an ArrayList with all the words of this document that are useful for WSD.
     * @return ambiguousWords
     */
    public ArrayList<AmbiguousWord> getAmbiguousWords() {
      return ambiguousWords;
    }
    /**
     * Returns a HashMap indicating the positions (appearances) of each lemma in the document.
     * @return Index
     */
    public HashMap<String, ArrayList<Integer>> getIndex() {
      return Index;
    }
    /**
     * Parses an sgf text for RAW tagging.
     * @param targetText Target sgf text.
     * @param dict Source dictionary.
     * @throws Exception
     */
    public Input(String targetText, Dictionary dict)throws Exception
    {
      int running=0;
      int target=0;
     
      DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = fact.newDocumentBuilder();
        StringReader fin=new StringReader(targetText);
        BufferedReader in=new BufferedReader(fin);
        String line;
        String body="";
        while((line=in.readLine())!=null)
        {
          body+=line+"\n";
        }
        in.close();
        fin.close();
       
      Document test=builder.parse(new ByteArrayInputStream(body.getBytes("UTF-8")));
      this.name="tmp";
      NodeList ws=test.getElementsByTagName("wf");
      this.ambiguousWords=new ArrayList<AmbiguousWord>(ws.getLength());
      int index=0;
      NodeList paragraphs=test.getElementsByTagName("p");
      this.textSentences=new ArrayList<String>();
      this.contain=new ArrayList<ArrayList<ArrayList<AmbiguousWord>>>();
     
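      //When the file contains no <p> elements, the whole document is processed as a single paragraph.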
      for(int p=0;p<paragraphs.getLength()||(paragraphs.getLength()==0&&p==0);p++)
      {
        NodeList sn;
        if(paragraphs.getLength()==0)
        {
          sn=test.getElementsByTagName("s");
        }
        else
        {
          sn=((Element)paragraphs.item(p)).getElementsByTagName("s");
        }
        ArrayList<ArrayList<AmbiguousWord>> sentences=new ArrayList<ArrayList<AmbiguousWord>>();
        for(int s=0;s<sn.getLength();s++)
        {
          ArrayList<AmbiguousWord> words=new ArrayList<AmbiguousWord>();
          Element sentence=((Element)sn.item(s));
          NodeList wds=sentence.getElementsByTagName("wf");
          String text="";
          for(int w=0;w<wds.getLength();w++)
          {
            System.out.print(".");
            Element word=(Element)wds.item(w);
            AmbiguousWord nword=null;
           
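            //Normalize the POS attribute to its first letter (N, V, J, R); "W" tags are folded into "R".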
            String pos=word.getAttribute("pos");
            if(!pos.equals(""))
              pos=pos.substring(0,1).toUpperCase();
            if(pos.equals("W"))
              pos="R";
           
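            //In RAW mode, only words marked with cmd="tag" become disambiguation targets.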
            if(word.getAttribute("cmd").equals("tag"))
            { 
              running++;
              target++;
              if(pos.equals("N")||pos.equals("V")||pos.equals("R")||pos.equals("J")||pos.equals(""))
              {
                if(pos.equals("")||pos.equals("NA"))
                  nword=new AmbiguousWord(word.getAttribute("lemma"),word.getAttribute("wnsn"),index,dict);
                else
                  nword=new AmbiguousWord(word.getAttribute("lemma")+"_"+pos,word.getAttribute("wnsn"),index,dict);
                this.ambiguousWords.add(nword);
                index++;
                words.add(nword);
              }
            }         
            if(word.getTextContent()!=null)
              text+=word.getTextContent()+" ";
          }
         
          if(words.size()>0)
          {
            words.trimToSize();
            this.textSentences.add(text.trim());
            sentences.add(words);
          }
        }
        sentences.trimToSize();
        this.contain.add(sentences);
      }
      this.ambiguousWords.trimToSize();
      this.contain.trimToSize();
      this.Indexing();
      this.calculateTF();
      System.out.println();
      System.out.println("Running words: "+String.valueOf(running));
      System.out.println("Target words: "+String.valueOf(target));
      this.load();
      System.out.println();
    }
    /**
     * Parses an sgf file.
     * @param file Target file.
     * @param filter SenseFilter for excluding some senses from the words.
     * @param dict Source dictionary.
     * @param noTag Set it to true to include words that were not manually tagged (such as open-class or untagged words);
     * set it to false to exclude them.
     * @param fullLoad Set it to true to load all the LemmaObjects from the dictionary;
     * set it to false to create AmbiguousWords with null LemmaObjects in order to save memory.
     * @throws Exception
     */
    public Input(File file, SenseFilter filter, Dictionary dict,boolean noTag, boolean fullLoad)throws Exception
    {
      int running=0;
      int target=0;
     
      DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = fact.newDocumentBuilder();
        FileReader fin=new FileReader(file);
        BufferedReader in=new BufferedReader(fin);
        String line;
        String body="";
        while((line=in.readLine())!=null)
        {
          body+=line+"\n";
        }
        in.close();
        fin.close();
       
      Document test=builder.parse(new ByteArrayInputStream(body.getBytes("UTF-8")));
      this.name=file.getName();
      NodeList ws=test.getElementsByTagName("wf");
      this.ambiguousWords=new ArrayList<AmbiguousWord>(ws.getLength());
      int index=0;
      NodeList paragraphs=test.getElementsByTagName("p");
      this.textSentences=new ArrayList<String>();
      this.contain=new ArrayList<ArrayList<ArrayList<AmbiguousWord>>>();
     
      for(int p=0;p<paragraphs.getLength()||(paragraphs.getLength()==0&&p==0);p++)
      {
        NodeList sn;
        if(paragraphs.getLength()==0)
        {
          sn=test.getElementsByTagName("s");
        }
        else
        {
          sn=((Element)paragraphs.item(p)).getElementsByTagName("s");
        }
        ArrayList<ArrayList<AmbiguousWord>> sentences=new ArrayList<ArrayList<AmbiguousWord>>();
        for(int s=0;s<sn.getLength();s++)
        {
          ArrayList<AmbiguousWord> words=new ArrayList<AmbiguousWord>();
          Element sentence=((Element)sn.item(s));
          NodeList wds=sentence.getElementsByTagName("wf");
          String text="";
          for(int w=0;w<wds.getLength();w++)
          {
            System.out.print(".");
            Element word=(Element)wds.item(w);
            AmbiguousWord nword=null;
           
            String pos=word.getAttribute("pos");
            if(!pos.equals(""))
              pos=pos.substring(0,1).toUpperCase();
            if(pos.equals("W"))
              pos="R";
           
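            //In an annotated file, target words are marked with cmd="done".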
            if(word.getAttribute("cmd").equals("done"))
            { 
              running++;
              target++;
              if(pos.equals("N")||pos.equals("V")||pos.equals("R")||pos.equals("J")||pos.equals(""))
              {
                if(pos.equals("")||pos.equals("NA"))
                  nword=new AmbiguousWord(word.getAttribute("lemma"),word.getAttribute("wnsn"),index,dict);
                else
                  nword=new AmbiguousWord(word.getAttribute("lemma")+"_"+pos,word.getAttribute("wnsn"),index,dict);
                if(filter!=null)
                  filter.prune(nword);
                this.ambiguousWords.add(nword);
                index++;
                words.add(nword);
              }
            }
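            //When noTag is true, words without a manual tag are still added as untagged AmbiguousWords (context words).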
            if(noTag&&nword==null)
            {
              running++;
              String value;
              if(word.getAttribute("lemma")!=null&&!word.getAttribute("lemma").equals(""))
                value=word.getAttribute("lemma");
              else
              {
                if(word.getAttribute("pos").equals(""))
                  value=word.getTextContent().toLowerCase();
                else
                  value=word.getTextContent().toLowerCase()+"_"+word.getAttribute("pos").substring(0,1);
              }
               
             
              nword=new AmbiguousWord(value,index,dict);
              this.ambiguousWords.add(nword);
              index++;
              words.add(nword);
            }
            if(word.getTextContent()!=null)
              text+=word.getTextContent()+" ";
          }
         
          if(words.size()>0)
          {
            words.trimToSize();
            this.textSentences.add(text.trim());
            sentences.add(words);
          }
        }
        sentences.trimToSize();
        this.contain.add(sentences);
      }
      this.ambiguousWords.trimToSize();
      this.contain.trimToSize();
      this.Indexing();
      this.calculateTF();
      System.out.println();
      System.out.println("Running words: "+String.valueOf(running));
      System.out.println("Target words: "+String.valueOf(target));
      if(fullLoad)
      {
        this.load();
        System.out.println();
      }
    }

  /**
     * Method for loading all the corresponding LemmaObjects from the base dictionary.
     * @throws Exception
     */
    public void load() throws Exception {
      //Loading lemmas
      double poly=0.0;
      int count=0;
      System.out.println("Loading lemmas from dictionary please wait")
      for(Entry<String,ArrayList<Integer>> entry:this.Index.entrySet())
      {
        System.out.print(".");
        count++;
        AmbiguousWord word=this.ambiguousWords.get(entry.getValue().get(0).intValue());
       
        Lemma l=word.getLemmaObject();
        if(l!=null)
        {
          poly+=((double)word.getSenses().size());
          for(int i=1;i<entry.getValue().size();i++)
          {
            count++;
            word=this.ambiguousWords.get(entry.getValue().get(i).intValue());
            word.setLemmaObject(l);
            poly+=((double)word.getSenses().size());
          }
        } 
        else
        {
          System.out.print("!");
        }
      }
      System.out.println("\nPolysemy: "+String.valueOf(poly/((double)count)));
  }
    /**
     * Calculates the TF value for all the ambiguous words in this document.
     */
    private void calculateTF() {
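      //TF = appearances of the word's lemma divided by the total number of words in the document.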
      for(AmbiguousWord word:this.ambiguousWords)
      {
        if(word.getCorrectSenseNumbers().length>0)
        {
          ArrayList<Integer> aux=this.Index.get(word.getLemma());
          word.setTf((double)aux.size()/(double)this.ambiguousWords.size());
        }
        else
        {
          double tf=0.0;
          for(AmbiguousWord tword:this.ambiguousWords)
          {
            if(word.getLemma().equals(tword.getLemma()))
            {
              tf+=1.0;
            }
          }
          word.setTf(tf/(double)this.ambiguousWords.size());
        }
      }
    }
    /**
     * Creates the index of lemma appearances.
     */
    private void Indexing()
    {
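      //Only words with at least one correct sense number are added to the index.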
      Index=new HashMap<String,ArrayList<Integer>>();
      int i=0;
      for(AmbiguousWord word:this.ambiguousWords)
      {
        if(word.getCorrectSenseNumbers().length>0)
        {
          if(Index.containsKey(word.getLemma()))
          {
            Index.get(word.getLemma()).add(Integer.valueOf(i));
          }
          else
          {
            ArrayList<Integer> aux=new ArrayList<Integer>(5);
            aux.add(Integer.valueOf(i));
            Index.put(word.getLemma(), aux);
          }
        }
        i++;
      }
    }

    /**
     * Returns the name of this file.
     */
    public String toString()
    {
      return name;
    }
    /**
     *
     * @return All the sentences in this document
     */
    public ArrayList<ArrayList<AmbiguousWord>> getSentences() {
      ArrayList<ArrayList<AmbiguousWord>> sentences=new ArrayList<ArrayList<AmbiguousWord>>();
      for(ArrayList<ArrayList<AmbiguousWord>> ss:this.contain)
      {
        sentences.addAll(ss);
      }
      return sentences;
    }
    /**
     *
     * @return this.textSentences
     */
    public ArrayList<String> getTextSentences()
    {
      return this.textSentences;
    }
    /**
     * Returns the TF-IDF value for a word.
     * @param word Target word.
     * @return The TF-IDF value.
     */
    public double getTFIDF(String word) {
      ArrayList<Integer> aux=this.Index.get(word);
      if(aux!=null)
        return this.ambiguousWords.get(aux.get(0)).getTf()*this.ambiguousWords.get(aux.get(0)).getIDF();
      return 0.0;
    }
    /**
     * Generates an SGF file from plain text and writes it to the target path with an ".sgf" extension appended.
     * @param target Target file containing the plain text.
     * @param data Base DataBroker instance.
     * @return The generated sgf markup.
     * @throws Exception
     */
    public static String generateSGFFromRaw(File target, DataBroker data)throws Exception {
    FileReader fin=new FileReader(target);
    BufferedReader in=new BufferedReader(fin);
    String ftext="";
    ArrayList<String> text=new ArrayList<String>();
    String line;
    while((line=in.readLine())!=null)
    {
      text.add(line);               
    }
    ArrayList<ArrayList<ArrayList<String>>> chunks=new ArrayList<ArrayList<ArrayList<String>>>();
    ArrayList<ArrayList<ArrayList<String>>> lemmas=data.lemmatize(text,chunks);
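    //lemmatize() is assumed to return lemma_POS strings and to fill chunks with the original surface tokens in a parallel paragraph/sentence structure.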
    chunks.trimToSize();
    FileWriter fout=new FileWriter(target.getCanonicalPath()+".sgf");
    BufferedWriter out=new BufferedWriter(fout);
    ftext+="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
    ftext+="<contextfile concordance=\"raw\">\n";
    ftext+="\t<context filename=\""+target.getName()+"\" paras=\"yes\">\n";           
    int p=1;
    int s;
    String pos;
    for(ArrayList<ArrayList<String>> paragraph:lemmas)
    {
      ftext+="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
      s=1;
      ArrayList<ArrayList<String>> oparagraph=chunks.get(p-1);
      for(ArrayList<String> sentence:paragraph)
      {
        ArrayList<String> osentence=oparagraph.get(s-1);
        ftext+="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
        int w=0;
        for(String word:sentence)
        {
          pos=word.substring(word.length()-1);
          String oword=osentence.get(w);
         
          if(Dictionary.isPunctuation(pos))
          {
            ftext+="\t\t\t\t<punc>"+oword+"</punc>\n";
          }
          else
          {
            if(Dictionary.getPOS(pos)>=0)
            {
              ftext+="\t\t\t\t<wf cmd=\"tag\" pos=\""+pos+"\" lemma=\""+word.substring(0,word.length()-2)+"\" wnsn=\"0\">"+oword+"</wf>\n";
            }
            else
            {
              ftext+="\t\t\t\t<wf cmd=\"ignore\" pos=\""+pos+"\">"+oword+"</wf>\n";
            }
           
          }
         
          w++;
        }
        ftext+="\t\t\t</s>\n";
        s++;
      }     
      ftext+="\t\t</p>\n";
      p++;
    }
    ftext+="\t</context>\n";
    ftext+="</contextfile>\n";
    out.write(ftext);
    out.close();
    fout.close();
    in.close();
    fin.close();
    return ftext;
  }
    /**
     * Modifies a target sgf file by adding new wnsn attributes generated with a WSD algorithm.
     * @param file Target file name.
     * @param ds Decisions for modifying the wnsn attributes.
     * @throws Exception
     */
  public static void modifyAndSaveSGF(String file, ArrayList<Decision> ds) throws Exception{

    int index=0;
    int idx;
    String line;
    String text="";
    FileReader fin=new FileReader(file);
    BufferedReader in=new BufferedReader(fin);
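    //Each line holding a cmd="tag" word gets its wnsn attribute rewritten from the matching Decision:
    //answers are written 1-based and separated by ';'; "0" is written when the word was not attempted or has no answer.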
    while((line=in.readLine())!=null)
    {
      idx=line.indexOf("cmd=\"tag");
      if(idx>=0)
      {
        idx=line.indexOf("wnsn=\"");
        text+=line.substring(0,idx);
        text+="wnsn=\"";
        Decision d=ds.get(index);
        if(d.isAttempted())
        {
          int wnsns[]=d.getAnswers();
          if(wnsns.length>0)
          {
            text+=String.valueOf(wnsns[0]+1);
            for(int i=1;i<wnsns.length;i++)
            {
              text+=";"+String.valueOf(wnsns[i]+1);
            }
          }
          else
          {
            text+="0";
          }
        }
        else
        {
          text+="0";
        }
        index++;
        text+=line.substring(idx+7);
      }
      else
      {
        text+=line;
      }
      text+="\n";
    }   
    fin.close();
    in.close();
    FileWriter fout=new FileWriter(file);
    BufferedWriter out=new BufferedWriter(fout);
    out.write(text);
    out.close();
    fout.close();
  }

   
}
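
Example usage of gannuNLP.data.Input

A minimal usage sketch (not part of the original source). It relies only on the constructors and accessors defined above; how to obtain a concrete Dictionary and DataBroker instance is library-specific, so those lines are placeholders that must be replaced with real instances before running, and "document.txt" is a hypothetical input file.

import gannuNLP.data.AmbiguousWord;
import gannuNLP.data.Input;
import gannuNLP.dictionaries.DataBroker;
import gannuNLP.dictionaries.Dictionary;

import java.io.File;

public class InputExample {
  public static void main(String[] args) throws Exception {
    Dictionary dict = null;   //placeholder: plug in a concrete Dictionary implementation
    DataBroker broker = null; //placeholder: plug in a DataBroker wrapping that dictionary

    //Generate an sgf file from plain text, then parse it.
    File raw = new File("document.txt");
    Input.generateSGFFromRaw(raw, broker);

    //No sense filter, include untagged words, fully load LemmaObjects.
    Input doc = new Input(new File(raw.getCanonicalPath() + ".sgf"), null, dict, true, true);

    for (AmbiguousWord word : doc.getAmbiguousWords()) {
      System.out.println(word.getLemma() + " TF-IDF=" + doc.getTFIDF(word.getLemma()));
    }
  }
}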