Package gannuNLP.data

Examples of gannuNLP.data.Lemma


      {
        p=new SmallPair((Pair)o);
      }
      else
        p=(SmallPair) o;
      Lemma l=p.getLemma();
      for(int i=0;i<l.getSenses().size();i++)
      {
        for(int j=i+1;j<l.getSenses().size();j++)
        {
          if(p.getCounts()[j][i]>=(2*p.getLemma().getSenses().size()))       
            return false;
        }
      }       
View Full Code Here


    Collections.sort(updates);
    System.out.println("Saving samples!!!");
    d=1;
    Update p=null;
    SuperLemma s=null;
    Lemma l=null;
    for(Update u:updates)
    {
      System.out.println(String.valueOf(d)+"/"+String.valueOf(updates.size()));
      d++;
      if(p==null||!u.getLemma().equals(p.getLemma()))
      {
        if(s!=null)
        {
          l.addCount(counts.get(lemmas.indexOf(p.getLemma())));
          this.dict.WriteSuperLemma(path, s);
        }
        s=this.dict.loadSuperLemma(u.getLemma(),path);
        l=s.retrieveLemma(this.dict.getName());
      }
      Sense sens=l.getSenses().get(u.getSense());
      if(!sens.getSamples().contains(u.getText()))
      {
        sens.addBagOfWords(u.getText(), u.getBow(),this.name);
      }
       p=u;
    }
    if(s!=null)
    {
      l.addCount(counts.get(lemmas.indexOf(p.getLemma())));
      this.dict.WriteSuperLemma(path, s);
    }
  }
View Full Code Here


  @Override
  public Lemma getLemma(String lemma) throws Exception {
  URL url=new URL(this.path+"wiki/"+lemma.replace(" ", "_")+this.disambiguationWord);
  Lemma l=null;
 
  int iidx=this.badSearches.indexOf(lemma);
  if(iidx<0)
  {
    Document xml=this.loadURL(url);
    String text="";
    if(xml!=null)
      text=xml.getRootElement().getValue();
    if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
    {
      ArrayList<URL> urls=new ArrayList<URL>();
      ArrayList<Integer> levels=new ArrayList<Integer>();
      ArrayList<Integer> hindex=new ArrayList<Integer>();
      ArrayList<String> heads=new ArrayList<String>();
      for(Element div:xml.getDescendants(new ElementFilter("div")))
      {
        if(div.getAttributeValue("id")!=null&&div.getAttributeValue("id").equals("content"))
        {
          ElementFilter f=new ElementFilter("a");
          Filter<? extends Content> or=f.or(new ElementFilter("span"));
          for(Content c:div.getDescendants(or))
          {
            if(c.getCType()==CType.Element)
            {
              Element word=(Element)c;
              if(word.getName().equals("a"))
              {
                String href=word.getAttributeValue("href");
                if(href!=null&&href.startsWith("/wiki"))
                {
                  urls.add(new URL(this.path+href));
                  hindex.add(new Integer(heads.size()-1));
                }
              }
              else
              {
                if(word.getAttributeValue("class")!=null && word.getAttributeValue("class").equals("mw-headline"))
                {
                  heads.add(word.getAttributeValue("id"));
                  Element e=word.getParentElement();
                  levels.add(new Integer(Integer.parseInt(e.getName().substring(1))-2));               
                }
              }
            }
          }
          break;
        }
      }
      ArrayList<Sense> senses=new ArrayList<Sense>(urls.size());
      for(int i=0;i<urls.size();i++)
      {
        String urlx=urls.get(i).getFile().replace("/wiki/", "");
        if(!this.isNotAnArticle(urlx))//it is a sense
        {                   
          senses.add(this.getSense(urlx.replace("/", "")));
       
      }
      ArrayList<Count> counts=new ArrayList<Count>();
      counts.add(this.getWikiCounts(lemma));      ;     
      l=new Lemma(lemma,"",senses,counts,this.name);
    }
    else
    {
      url=new URL(this.path+"wiki/"+lemma.replace(" ", "_"));
      xml=this.loadURL(url);
      if(xml!=null)
        text=xml.getRootElement().getValue();
      if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
      {
        if(text.contains(this.disambiguationMSG)&&!this.jump)
        {         
          this.jump=true;
          ArrayList<Sense> senses=new ArrayList<Sense>();
          Element body=this.getContentNode(xml);
          for(Element e:body.getChildren())
          {
           
            if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("dablink"))
            {
              for(Content c:e.getContent())
              {
                if(c.getCType().equals(CType.Element))
                {
                  Element a=((Element)c);
                  if(a.getName().equals("a"))
                  {
                    String sid=a.getAttributeValue("href");
                    sid=sid.substring(sid.indexOf("wiki/")+5);
                    sid=sid.replace(this.disambiguationWord, "");
                    Lemma ll=this.getLemma(sid);
                    if(ll!=null)
                    {
                      for(Sense s:ll.getSenses())
                      {
                        if(!senses.contains(s))
                          senses.add(s);
                      }
                    }                   
                  }
                }
              }
            }
          }
          this.jump=false;
          if(senses.size()>0)
          {
            ArrayList<Count> counts=new ArrayList<Count>();
            counts.add(this.getWikiCounts(lemma));
            l=new Lemma(lemma,"",senses,counts,this.name);
          }
        }
        else
        {
          ArrayList<Sense> senses=new ArrayList<Sense>(1);
          String urlx=url.getFile().replace("/wiki/", "");
          if(!this.isNotAnArticle(urlx))//it is a sense
          {
            senses.add(this.getSense(urlx));
            ArrayList<Count> counts=new ArrayList<Count>();
            counts.add(this.getWikiCounts(lemma));
            l=new Lemma(lemma,"",senses,counts,this.name);
          }
        }
      }
    }       
  }
View Full Code Here

            }
          }
          //Lematize the wiki word
          String word=current.getValue();
          String lemma=word;
          Lemma l=db.getLemma(word);
          href=href.substring(href.indexOf("wiki/")+5);
          boolean ac=true;
          if(l==null)
          {
              l=db.getLemma(href);
              if(l!=null)
                lemma=l.getLemma();
              ac=false;
          }
          if(l!=null)
          {
            int i=0;
            boolean ban=false;
            for(Sense sense:l.getSenses())
            {
              i++;
              if(sense.getSid().equals(href))
              {
                ban=true;
                break;
              }
            }
            String wnsn="";
            if(ban)
            {
              wnsn=String.valueOf(i);
            }
            else
            {
              if(ac)
              {
                l=db.getLemma(href);
                if(l!=null)
                {
                  i=0;
                  ban=false;
                  for(Sense sense:l.getSenses())
                  {
                    i++;
                    if(sense.getSid().equals(href))
                    {
                      ban=true;
                      break;
                    }
                  }
                  if(ban)
                    wnsn=String.valueOf(i);
                }
              }
            }
            if(wnsn.equals("")&&l!=null)
            {
              Sense sense=this.getSense(href);
              ban=false;
              i=0;
              for(Sense sx:l.getSenses())
              {
                i++;
                if(sense.itContainsTheSameSamples(sx))
                {
                  ban=true;
                  break;
                }
              }
            }
            if(ban)
              wnsn=String.valueOf(i);
            if(wnsn.equals(""))
            {  stack.addAll(0,current.getContent());
              out.write("\t\t\t\t<!--Mismatch link for "+href.replace("&","&amp;")+" -->\n");
            }
            else
            {
              sentence+="\t\t\t\t<wf cmd=\"done\" pos=\"\" lemma=\""+Dictionary.normalizeLemmaforFile(lemma)
                  +"\" wnsn=\""+wnsn
                  +"\" lexsn=\""+Dictionary.normalizeLemmaforFile(l.getSenses().get(i-1).getSid())+"\">";
              sentence+=word;
              sentence+="</wf>\n";
            }
          }
          else
View Full Code Here

    {
      String lemma=Key.getKey();
      ArrayList<Sense> senses=this.getSenses(lemma);
      ArrayList<Count> counts=this.getCounts(lemma);
      SuperLemma s=this.loadSuperLemma(lemma,path);
      Lemma lemmaO=new Lemma(lemma,senses.get(0).getPos(),senses,counts,this.name);
      s.addLemma(lemmaO);
      this.WriteSuperLemma(path,s);
      i++;
      if(i%1000==0)
      {
View Full Code Here

   * @return An ArrayList having paragraphs as elements. Each paragraph (Element) contains
   * a set of sentences. Each sentence contains a set of words.
   */
  public ArrayList<ArrayList<ArrayList<String>>> lemmatize(ArrayList<String> text,ArrayList<ArrayList<ArrayList<String>>> chunks) throws Exception{
    ArrayList<ArrayList<ArrayList<String>>> ptext=new ArrayList<ArrayList<ArrayList<String>>>(text.size());     
    Lemma l=null;
    this.setTagger();
    for(String p:text)
    {
      List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(p));
      ArrayList<ArrayList<String>> paragraph=new ArrayList<ArrayList<String>>(sentences.size());
View Full Code Here

      }
      for(String plemma:Util.removeDuplicates(lemmatas))
      {
        for(String posTag:posTags)
        {
          Lemma l=this.getLemma(plemma+posTag);
          if(l!=null)
          {
            for(Sense s:l.getSenses())
            {
              if(!senses.contains(s))
              {
                senses.add(s);
              }
View Full Code Here

        }
      }
      File ft=new File(this.path+"/data/"+this.getName()+"/"+Dictionary.normalizeLemmaforFile(lemma)+".slm");
      if(ft.exists())
      {
        Lemma l=this.getLemmaNoModifiers(lemma);
        if(l!=null)         
          for(Count c:l.getCounts())
            count+=c.getFrequency();       
      }
      this.lemmas.add(lemma);
      this.counts.add(new Double(count));
    }
View Full Code Here

  public Lemma getLemmaNoModifiers(String lemma)throws Exception
 
    File d=new File("./data/lemmas/"+this.getName()+"/");
    d.mkdirs();
    SuperLemma s=this.loadSuperLemma(lemma,"./data/"+this.getName()+"/");
    Lemma l=null;
    if(s.getLemmas().size()>0)
    {
      l=s.retrieveLemma(this.source.toString());
      if(l!=null)
      {
        l=new Lemma(l,this.sampleSources);                   
      }

    }
    if(l==null&&this.source.isWeb())
    {
View Full Code Here

      d.mkdirs();
      d=new File("./data/lemmas/"+this.getName()+"/");
      d.mkdirs();
    }
    SuperLemma s=this.loadSuperLemma(lemma,"./data/"+this.getName()+"/");
    Lemma l=null;
    if(s.getLemmas().size()>0)
    {
      l=s.retrieveLemma(this.source.toString());
      if(l!=null)
      {
        l=new Lemma(l,this.sampleSources);                   
      }

    }
    if(l==null&&this.source.isWeb())
    {
      l=this.source.getLemma(lemma);
      if(l!=null)
      {
        s.addLemma(l);
        this.WriteSuperLemma("./data/"+this.getName()+"/",s);
      }
    }
    if(l!=null)
    {
      for(BoWModifier mod:this.modifiers)
      {
        mod.modifyBow(l);
      }     
    }
   
    if(l!=null)
    {
      l.trim();
      Util.writeObject(f, l);
    }
    return l;
  }
View Full Code Here

TOP

Related Classes of gannuNLP.data.Lemma

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.