Package gannuNLP.dictionaries

Source Code of gannuNLP.dictionaries.Wiki

package gannuNLP.dictionaries;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.jdom2.Content;
import org.jdom2.Content.CType;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.filter.ElementFilter;
import org.jdom2.filter.Filter;
import org.jdom2.input.JDOMParseException;
import org.jdom2.input.SAXBuilder;

import gannuNLP.data.Count;
import gannuNLP.data.Lemma;
import gannuNLP.data.Relation;
import gannuNLP.data.Sense;
import gannuUtil.Util;

/**
* Connector to Wikipedia dictionary.
* The paragraphs before the Table of Contents are taken as definitions.
* Please set up a URL to your own Wikipedia mirror with the setPath method.
* The valid versions of the Wikipedia connector are en (English) and es (Spanish).
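* <p>A minimal usage sketch (the mirror URL below is only illustrative; point setPath to your own mirror):</p>
* <pre>{@code
* Wiki wiki = new Wiki();
* wiki.setVersion("en");                    // "en" (English) or "es" (Spanish)
* wiki.setPath("http://en.wikipedia.org/"); // URL of your Wikipedia mirror, with trailing slash
* wiki.load("");                            // loads core data and the bad-search cache
* Lemma lemma = wiki.getLemma("bank");      // null when no matching article is found
* }</pre>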
* @author Francisco Viveros-Jiménez
*
*/
public class Wiki extends Dictionary {
  /**
   *
   */
  private static final long serialVersionUID = 1L;
 
  /**
   * String for identifying a missing page. E.g. "Wikipedia does not have an article with this exact name".
   */
  String missingMSG;
  /**
   * String for identifying a bad Wiki URL. E.g. "Wikimedia Error".
   */
  String wikiErrorMSG;
  /**
   * String for identifying definitions inside a specified section of a Wiki article.
   * E.g. "redirectToFragment(\"#".
   */
  String redirect;
  /**
   * String for identifying the disambiguation link of an article. E.g. "For other uses".
   */
  String disambiguationMSG;
  /**
   * String for identifying the section containing the search hits in the Wiki search page.
   * E.g. "Results ".
   */
  String countText;
  /**
   * Preposition written before the search hits (white space must be included). E.g. "of ". 
   */
  String countPrepIn;
  /**
   * Preposition written after the search hits (white space must be included). E.g. " for".
   */
  String countPrepOut;
 
  /**
   * String for identifying a disambiguation page. E.g. "_(disambiguation)".
   */
  String disambiguationWord;
             
  /**
   *   Internal flag for avoiding infinite loops while crawling.
   */
  private boolean jump;
  /**
   * Internal counter for limiting connection attempts while crawling.
   */
  int attempts;
  /**
   * List of bad URLs.
   */
  ArrayList<String> badSearches;
  /**
   * Instantiates a new Wikipedia connector.
   * You must call the selectLanguage, setPath, and load methods before querying Wikipedia.
   */
 
    public Wiki()
    {
      super();
      this.attempts=10;     
      this.isWeb=true;
      this.jump=false;
      this.usesPOSTag=false;
    }
    /**
     * Sets all the necessary values for processing the Wikipedia of a specified language.
     * English ("en") and Spanish ("es") are implemented.
     * Please modify this method for adding support for other languages.
     * @param language String for identifying the language. Please use the exact same code as Wikipedia.
   * E.g. "en" for English or "es" for Spanish.
     */
    public void setVersion(String language)
    {
      this.language=language;
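      //The marker strings below must match the text rendered by the target Wikipedia mirror
      //(missing-article message, hatnote text, search-results wording, etc.).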
      if(language.equals("en"))
      {
        this.disambiguationWord="_(disambiguation)";         
          this.name="English Wikipedia";
          this.missingMSG="Wikipedia does not have an article with this exact name";
          this.wikiErrorMSG="Wikimedia Error";
          this.redirect="redirectToFragment(\"#";
          this.disambiguationMSG="For other uses";
          this.countText="Results ";
          this.countPrepIn="of ";
          this.countPrepOut=" for";
      }
      if(language.equals("es"))
      {
        this.disambiguationWord="_(desambiguación)";
          this.name="Wikipedia en Español";
          this.missingMSG="Wikipedia aún no tiene una página llamada";
          this.wikiErrorMSG="Wikimedia Error";
          this.redirect="redirectToFragment(\"#";
          this.disambiguationMSG="Para otras acepciones";
          this.countText="Resultados ";
          this.countPrepIn="de ";
          this.countPrepOut=" para";
      }
    }
    /**
     * Method for finding the XML node that contains the actual Wiki article within a target document.
     * @param xml Target document.
     * @return The XML node containing the article.
     */
    private Element getContentNode(Document xml)
    {
      //Search for mw-content-text
    //Gloss starts inside the first <p> nodes and ends with the appearance of another node type (like <div> or <h2>)
    Element root=xml.getRootElement();
    Element body=null;
    for(Element e:root.getChildren())
    {
      if(e.getName().equals("body"))
      {
        body=e;
        break;
      }
    }
    Element content=null;
    for(Element e:body.getChildren())
    {
      if(e.getAttribute("id")!=null&&e.getAttributeValue("id").equals("content"))
      {
        content=e;
        break;
      }
    }
    Element bodycontent=null;
    for(Element e:content.getChildren())
    {
      if(e.getAttribute("id")!=null&&e.getAttribute("id").getValue().equals("bodyContent"))
      {
        bodycontent=e;
        break;
      }
    }
    Element gloss=null;
    for(Element e:bodycontent.getChildren())
    {
      if(e.getAttribute("id")!=null&&e.getAttribute("id").getValue().equals("mw-content-text"))
      {
        gloss=e;
        break;
      }
    }
    return gloss;
    }
  @SuppressWarnings("unchecked")
  @Override
  public void loadCoreData() throws Exception {
    URL url=new URL(this.path);   
    this.glossCount=0.0;
    try
    {
      Document xml=this.loadURL(url);
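      //For English, the main page links to /wiki/Special:Statistics; the link text is the article count.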
      if(language.equals("en"))
      {
        for(Element div:xml.getDescendants(new ElementFilter("a")))
        {
          if(div.getAttributeValue("href")!=null&&div.getAttributeValue("href").equals("/wiki/Special:Statistics"))
          {
            String str=div.getValue();
            System.out.println(str);
            this.glossCount=Double.parseDouble(str.replace(",", ""));
            break;
          }
        }
      }
      else
      {
        for(Element div:xml.getDescendants(new ElementFilter("li")))
        {
          if(div.getAttributeValue("id")!=null && div.getAttributeValue("id").contains("lang-"))
          {
            for(Element span:div.getDescendants(new ElementFilter("span")))
            {
              if(span.getAttributeValue("lang").equals(language))
              {
                String str=div.getValue();
                str=str.substring(0,str.indexOf(" articles"));
                str=str.split("More than ")[1];
                System.out.println(str);
                this.glossCount=Double.parseDouble(str.replace(",", ""));
              }
            }
          }
        }
      }
      if(language.equals("es"))
      {
        String f=xml.getRootElement().getValue();
        f=f.substring(f.indexOf(" artículos en español")-30,f.indexOf("artículos en español")).split("\n")[1].replace("\u00a0","");
        this.glossCount=Double.parseDouble(f);
      }
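      //Rough estimate of the total word count: assume ~320 words per article on average.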
      this.wordCount=this.glossCount*320.0;
      url=new URL(this.path);
      url.openStream();

    }
    catch(Exception e)
    {
      System.out.println("Warning: wiki mirror is unreachable, switching to offline mode!");
    }
    File f=new File("./data/Wiki.bws");
    if(f.exists())
      this.badSearches=(ArrayList<String>)Util.loadObject(f);
    else
      this.badSearches=new ArrayList<String>();
  }

  @Override
  public void load(String sampleSources) throws Exception {
    this.loadCoreData();
   
  }

  /**
   * Under construction.
   */
  @Override
  public void parseSamplesFromDictionary() throws Exception {
    //TODO

  }



  @Override
  public Lemma getLemma(String lemma) throws Exception {
  URL url=new URL(this.path+"wiki/"+lemma.replace(" ", "_")+this.disambiguationWord);
  Lemma l=null;
 
  int iidx=this.badSearches.indexOf(lemma);
  if(iidx<0)
  {
    Document xml=this.loadURL(url);
    String text="";
    if(xml!=null)
      text=xml.getRootElement().getValue();
    if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
    {
      ArrayList<URL> urls=new ArrayList<URL>();
      ArrayList<Integer> levels=new ArrayList<Integer>();
      ArrayList<Integer> hindex=new ArrayList<Integer>();
      ArrayList<String> heads=new ArrayList<String>();
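      //Collect every /wiki link on the disambiguation page (urls), the section headings found so far
      //(heads, levels) and, for each link, the index of the heading it appears under (hindex).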
      for(Element div:xml.getDescendants(new ElementFilter("div")))
      {
        if(div.getAttributeValue("id")!=null&&div.getAttributeValue("id").equals("content"))
        {
          ElementFilter f=new ElementFilter("a");
          Filter<? extends Content> or=f.or(new ElementFilter("span"));
          for(Content c:div.getDescendants(or))
          {
            if(c.getCType()==CType.Element)
            {
              Element word=(Element)c;
              if(word.getName().equals("a"))
              {
                String href=word.getAttributeValue("href");
                if(href!=null&&href.startsWith("/wiki"))
                {
                  urls.add(new URL(this.path+href));
                  hindex.add(new Integer(heads.size()-1));
                }
              }
              else
              {
                if(word.getAttributeValue("class")!=null && word.getAttributeValue("class").equals("mw-headline"))
                {
                  heads.add(word.getAttributeValue("id"));
                  Element e=word.getParentElement();
                  levels.add(new Integer(Integer.parseInt(e.getName().substring(1))-2));               
                }
              }
            }
          }
          break;
        }
      }
      ArrayList<Sense> senses=new ArrayList<Sense>(urls.size());
      for(int i=0;i<urls.size();i++)
      {
        String urlx=urls.get(i).getFile().replace("/wiki/", "");
        if(!this.isNotAnArticle(urlx))//it is a sense
        {
          senses.add(this.getSense(urlx.replace("/", "")));
        }
      }
      ArrayList<Count> counts=new ArrayList<Count>();
      counts.add(this.getWikiCounts(lemma));
      l=new Lemma(lemma,"",senses,counts,this.name);
    }
    else
    {
      url=new URL(this.path+"wiki/"+lemma.replace(" ", "_"));
      xml=this.loadURL(url);
      if(xml!=null)
        text=xml.getRootElement().getValue();
      if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
      {
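        //No "_(disambiguation)" page was found, so fall back to the plain article.
        //If the article carries a "For other uses" hatnote, follow the hatnote links and merge their
        //senses (the jump flag prevents infinite recursion); otherwise the article itself is the single sense.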
        if(text.contains(this.disambiguationMSG)&&!this.jump)
        {         
          this.jump=true;
          ArrayList<Sense> senses=new ArrayList<Sense>();
          Element body=this.getContentNode(xml);
          for(Element e:body.getChildren())
          {
           
            if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("dablink"))
            {
              for(Content c:e.getContent())
              {
                if(c.getCType().equals(CType.Element))
                {
                  Element a=((Element)c);
                  if(a.getName().equals("a"))
                  {
                    String sid=a.getAttributeValue("href");
                    sid=sid.substring(sid.indexOf("wiki/")+5);
                    sid=sid.replace(this.disambiguationWord, "");
                    Lemma ll=this.getLemma(sid);
                    if(ll!=null)
                    {
                      for(Sense s:ll.getSenses())
                      {
                        if(!senses.contains(s))
                          senses.add(s);
                      }
                    }                   
                  }
                }
              }
            }
          }
          this.jump=false;
          if(senses.size()>0)
          {
            ArrayList<Count> counts=new ArrayList<Count>();
            counts.add(this.getWikiCounts(lemma));
            l=new Lemma(lemma,"",senses,counts,this.name);
          }
        }
        else
        {
          ArrayList<Sense> senses=new ArrayList<Sense>(1);
          String urlx=url.getFile().replace("/wiki/", "");
          if(!this.isNotAnArticle(urlx))//it is a sense
          {
            senses.add(this.getSense(urlx));
            ArrayList<Count> counts=new ArrayList<Count>();
            counts.add(this.getWikiCounts(lemma));
            l=new Lemma(lemma,"",senses,counts,this.name);
          }
        }
      }
    }       
  }
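  //Remember lemmas that produced no article so future lookups can skip the network round-trip.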
  if(l==null&&iidx<0)
  {
    this.badSearches.add(lemma);
    this.badSearches=Util.removeDuplicates(this.badSearches);
    Collections.sort(this.badSearches);
    File f=new File("./data/Wiki.bws");
    Util.writeObject(f, this.badSearches);
  }
  return l;
}
  /**
   * Downloads the specified URL.
   * @param url Target URL.
   * @return The HTML document.
   * @throws Exception
   */
  Document loadURL(URL url) throws Exception{
    SAXBuilder builder=new SAXBuilder();
    Document xml=null;
    HttpURLConnection con=(HttpURLConnection) url.openConnection();
    con.setInstanceFollowRedirects(true);
   
    InputStream in;
    try
    {
       in = con.getInputStream();
    }
    catch(Exception e)
    {
      in=con.getErrorStream();
    }
    String encoding = con.getContentEncoding();
    encoding = encoding == null ? "UTF-8" : encoding;
    if(encoding.equals("gzip"))
    {
      in=new GZIPInputStream(con.getInputStream());
      encoding="UTF-8";
    }
   
    String body = IOUtils.toString(in, encoding).replace(" & ", "&amp;");
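    //Wikipedia HTML often contains unescaped '&' characters that break the XML parser.
    //Retry up to 'attempts' times, escaping the '&' nearest to each reported parse error.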
    for(int i=0;i<this.attempts;i++)
    {
      try
      {
        xml=(Document) builder.build(new StringReader(body));
        i+=this.attempts+1;
      }
      catch(JDOMParseException ex)
      {
        int line=ex.getLineNumber();
        int col=ex.getColumnNumber();
        int cline=1;
        int index=0;
        while(cline<line)
        {
          index=body.indexOf("\n",index)+1;
          cline++;
        }
        //line found
        index+=col;
        if(index>=body.length())
          index=body.length()-1;
        while(index>=0&&body.charAt(index)!='&')
        {
          index--;
        }
        if(index>-1)
        {
          String firstHalf=body.substring(0,index);
          String otherHalf=body.substring(index+1);
          body=firstHalf+"&amp;"+otherHalf;
        }
               
      }     
    }
    return xml;
  }
 
  /**
   * Downloads a target URL as plain text.
   * @param url Target URL.
   * @return Plain text of the URL.
   * @throws Exception
   */
  String loadURLAsText(URL url) throws Exception{
    HttpURLConnection con=(HttpURLConnection) url.openConnection();
    con.setInstanceFollowRedirects(true);
   
    InputStream in;
    try
    {
       in = con.getInputStream();
    }
    catch(Exception e)
    {
      in=con.getErrorStream();
    }
    String encoding = con.getContentEncoding();
    encoding = encoding == null ? "UTF-8" : encoding;
    if(encoding.equals("gzip"))
    {
      in=new GZIPInputStream(con.getInputStream());
      encoding="UTF-8";
    }
   
    String body = IOUtils.toString(in, encoding).replace(" & ", "&amp;");
    return body;
  }
 
  /**
   * Returns the search hit count for a given word.
   * @param lemma Target word.
   * @return A count object containing the search hits.
   * @throws Exception
   */
  public Count getWikiCounts(String lemma) throws Exception{
    double w=0.0;
    URL count=new URL(this.path+"/w/index.php?title=Special%3ASearch&profile=default&search="+lemma.replace(" ", "%20")+"&fulltext=Search");   
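    //Query Special:Search and read the hit total from the results header ("Results ... of N for ...").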
    Document xml=this.loadURL(count);
    if(xml!=null)
    {
      String aux="mw-search-formheader";
      for(Element e:this.getContentNode(xml).getDescendants(new ElementFilter("div")))
      {
        if(aux.equals(e.getAttributeValue("class")))
        {
          for(Element x:e.getDescendants(new ElementFilter("div")))
          {
            aux="results-info";
            if(aux.equals(x.getAttributeValue("class")))
            {
              aux=e.getValue().substring(e.getValue().indexOf(this.countText)+this.countText.length());
              aux=aux.substring(aux.indexOf(this.countPrepIn)+this.countPrepIn.length());
              aux=aux.substring(0,aux.indexOf(this.countPrepOut));
              aux=aux.replaceAll("\\D", "");       
              w=Double.parseDouble(aux);
              break;
            }
          }
          break;
        }
      }
    }
    else
    {
      String aux=this.loadURLAsText(count);
      aux=aux.substring(aux.indexOf("mw-search-formheader"));
      aux=aux.substring(aux.indexOf("results-info"));
      aux=aux.substring(aux.indexOf(this.countPrepIn)+this.countPrepIn.length());
      aux=aux.substring(0,aux.indexOf(this.countPrepOut));
      aux=aux.replaceAll("\\D", "");
      w=Double.parseDouble(aux);
    }
    Count c=new Count(w, this.name);
    return c;
  }

 
  public Sense getSense(String sid)throws Exception {   
    Sense s;   
    File f=new File("./data/wiki/"+Dictionary.normalizeLemmaforFile(this.getCompleteName())+"/"+Dictionary.normalizeLemmaforFile(sid)+".wco");
    File dir=new File("./data/wiki/"+Dictionary.normalizeLemmaforFile(this.getCompleteName())+"/");
    dir.mkdirs();
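    //Senses are cached on disk as .wco files; reuse the cached copy when available.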
    if(f.exists())
    {
      s=(Sense)Util.loadObject(f);
    }
    else
    {
      ArrayList<String> syns=new ArrayList<String>(1);
      syns.add(sid);
      s=new Sense(sid,"",syns);
      URL url=new URL(this.path+"wiki/"+sid.replace(" ", "_"));
      Document xml=this.loadURL(url);
      Element body=this.getContentNode(xml);
      //Check if definition is a section
      if(!xml.getRootElement().getValue().contains(this.redirect))
      {
        boolean start=false;
       
        for(Element e:body.getChildren())
        {
          if(e.getName().equals("p"))
          {
            start=true;
            s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
            for(Element ge:e.getChildren())
            {
              //Retrieve all the <a> for extracting the inGloss relation
              if(ge.getName().equals("a"))
              {
                String nurl=ge.getAttributeValue("href");
                s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
              }
            }
          }
          else
          {
            if(start)
              break;
          }
        }

      }
      else//Look for the start of the section
      {
        String all=xml.getRootElement().getValue();
        String section=all.substring(all.indexOf(this.redirect)+this.redirect.length());
        section=section.substring(0,section.indexOf("\")"));
       
        boolean start=false;
        for(Element e:body.getChildren())
        {
          if(start)
          {
            if(e.getName().equals("p"))
            {
              s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
              for(Element ge:e.getChildren())
              {
                //Retrieve all the <a> for extracting the inGloss relation
                if(ge.getName().equals("a"))
                {
                  String nurl=ge.getAttributeValue("href");
                  s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
                }
              }
            }
            else
            {
              break;
            }

          }
          else
          {
            if(e.getName().startsWith("h"))
            {
              for(Element es:e.getChildren())
              {
                if(es.getName().equals("span")&&section.equals(es.getAttributeValue("id")))
                {               
                  start=true;
                }
              }
            }
          }
         
        }
      }
      Element navbox=null;
      for(Element e:xml.getRootElement().getDescendants(new ElementFilter("table")))
      {
        if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("navbox"))
        {
            navbox=e;
            break;
        }
      }
      //Add inNavBox relations
      if(navbox!=null)
      {
        for(Element e:navbox.getDescendants(new ElementFilter("a")))
        {
          String nurl=e.getAttributeValue("href");
          if(nurl!=null&&!this.isNotAnArticle(nurl))
          {
            s.addRelation("inNavBox", new Relation("inNavBox", nurl.replace("/wiki/", ""), ""));
          }
        }
      }
      //Add in CatLinks relations
      Element catlinks=null;
      String aux="catlinks";
      for(Element e:body.getParent().getDescendants(new ElementFilter("div")))
      {
        if(aux.equals(e.getAttributeValue("id")))
        {
          catlinks=e;
          break;
        }
      }
      if(catlinks!=null)
      {
        for(Element e:catlinks.getDescendants(new ElementFilter("a")))
        {
          String nurl=e.getAttributeValue("href");
          if(nurl!=null&&!this.isNotAnArticle(nurl))
          {
            s.addRelation("inCatLinks", new Relation("inCatLinks", nurl.replace("/wiki/", ""), ""));
          }
        }
      }
      Util.writeObject(f, s);
    }
    return s;
  }
 
  /**
   * Tells whether the given URL is NOT an actual article.
   * Please modify this method for adding pages that you believe must be excluded or
   * for adding support to a new language.
   * @param url Target URL.
   * @return True if the target page is not an article.
   */
    public boolean isNotAnArticle(String url)
    {
      boolean ban=url.contains("Template:");
      ban=ban||url.contains("w/index.php?")||url.contains("Template_talk:")||url.contains("Help:");
      ban=ban||url.contains("Wikipedia:")||url.contains("File:")||url.contains(".");
      ban=ban||url.contains("Special:")||url.contains("#cite_note")||url.startsWith("#");
      ban=ban||url.contains("Especial:")||url.contains("Ayuda:")||url.contains("Category:");
      ban=ban||url.contains("Categoría:")||url.contains("Talk:")||url.contains("Discusión:");
      ban=ban||url.contains("List_of")||url.contains("Lists_of")||url.contains(this.disambiguationWord);
      return ban;
    }
    /**
     * Method for creating an SGF file from a Wikipedia article.
     * Use this method for creating a corpus made from Wikipedia articles.
     * @param sid Name of the article as specified in the Wikipedia URL.
     * E.g. "Gray_wolf", "Iron_man", etc.
     * @param path Folder for saving the SGF file.
     * @throws Exception
     */
    @SuppressWarnings("unused")
  public void createInputFromArticle(String sid,String path)throws Exception
  {
      File f=new File(path);
      f.mkdirs();
      FileWriter fout=new FileWriter(path+sid+".sgf");
      BufferedWriter out=new BufferedWriter(fout);
    URL url=new URL(this.path+"wiki/"+sid);
    Document xml=this.loadURL(url);
    Element body=this.getContentNode(xml);   
    DataBroker db=new DataBroker("gannuNLP.dictionaries.Wiki",this.language);
    db.setPath(this.path);   
    db.load("Glosses");
   
    out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    out.write("<contextfile concordance=\""+this.name.replace("->", ".")+"\">\n");
    out.write("\t<context filename=\""+sid.replace("&","&amp;")+"\" paras=\"yes\">\n");   
    int p=1;
    int s=1;
    String paragraph="";
    String sentence="";
    paragraph+="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
    sentence+="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
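    //Depth-first traversal of the article's content nodes using an explicit stack.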
    ArrayList<Content> stack=new ArrayList<Content>();
    stack.addAll(body.getContent());
    while(stack.size()>0)
    {
      Content c=stack.get(0);
      stack.remove(0);
      if(c.getCType().equals(CType.Text))//actual text
      {
        //a dot creates a new sentence after processing
        String line=c.getValue().trim();
        while(!line.equals(""))
        {         
          int idx=line.indexOf(" ");
          String words;
          if(idx>=0)
            words=line.substring(0,idx);
          else
            words=line;
          line=line.substring(words.length()).trim();
          String punct=words.replaceAll("\\p{Punct}","¤");
          int index=0;
          while(!punct.equals(""))
          {
            idx=punct.indexOf("¤");
            String word;
            if(idx>=0)
              word=punct.substring(0,idx);
            else
              word=punct;
            if(word.equals(""))
            {
              //first the punctuation then the word
              //add a punc node
             
              if(words.charAt(index)=='<')
              {
                sentence+="\t\t\t\t<punc>&lt;</punc>\n";
              }
              else
              {
                if(words.charAt(index)=='>')
                  sentence+="\t\t\t\t<punc>&gt;</punc>\n";
                else
                  sentence+="\t\t\t\t<punc>"+words.charAt(index)+"</punc>\n";
              }
              if(words.charAt(index)=='.')
              {
                sentence+=("\t\t\t</s>\n");
                if(sentence.contains("wf"))
                {
                  System.out.print(".");
                  s++;
                  paragraph+=sentence;
                }
                sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
              }
              index++;
              punct=punct.substring(1);
            }
            else
            {
              index+=word.length();
              sentence+="\t\t\t\t<wf cmd=\"tag\" pos=\"\" lemma=\""+word+"\" wnsn=\"0\" lexsn=\"NA\">";
              sentence+=word;
              sentence+="</wf>\n";
              punct=punct.substring(word.length());
            }
             
          }
           
        }
      }
      if(c.getCType().equals(CType.Element))//other html elements such a or table should extract the text inside these elements
      {
        Element current=(Element)c;
        //tr creates a new sentence after processing
        String href=current.getAttributeValue("href");
     
        String aux="navbox";
        if(aux.equals(current.getAttributeValue("class")))
          break;
        if(href!=null&&current.getName().equals("a")&&!this.isNotAnArticle(href)&&!href.contains("Category:"))
        {
          if(!href.contains("%25"))
          {
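            //Manually decode percent-encoded UTF-8 bytes in the href (e.g. "%C3%A9") back into characters.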
            while(href.contains("%"))
            {
              int index=href.indexOf("%");         
              String first=href.substring(0,index);
              if(index>href.length())
                index=href.length();
              String last=href.substring(index+3);
              String hex="0x"+href.substring(index+1,index+3);
              byte b[];
              if(last.startsWith("%"))
              {
                b=new byte[2];             
                b[0]=(byte)Integer.decode(hex).intValue();
                b[1]=(byte)Integer.decode("0x"+last.substring(1,3)).intValue();
                last=last.substring(3);
              }
              else
              {
                b=new byte[1];
                b[0]=(byte)Integer.decode(hex).intValue();
              }
              href=first+new String(b,"UTF-8")+last;
            }
          }
          //Lematize the wiki word
          String word=current.getValue();
          String lemma=word;
          Lemma l=db.getLemma(word);
          href=href.substring(href.indexOf("wiki/")+5);
          boolean ac=true;
          if(l==null)
          {
              l=db.getLemma(href);
              if(l!=null)
                lemma=l.getLemma();
              ac=false;
          }
          if(l!=null)
          {
            int i=0;
            boolean ban=false;
            for(Sense sense:l.getSenses())
            {
              i++;
              if(sense.getSid().equals(href))
              {
                ban=true;
                break;
              }
            }
            String wnsn="";
            if(ban)
            {
              wnsn=String.valueOf(i);
            }
            else
            {
              if(ac)
              {
                l=db.getLemma(href);
                if(l!=null)
                {
                  i=0;
                  ban=false;
                  for(Sense sense:l.getSenses())
                  {
                    i++;
                    if(sense.getSid().equals(href))
                    {
                      ban=true;
                      break;
                    }
                  }
                  if(ban)
                    wnsn=String.valueOf(i);
                }
              }
            }
            if(wnsn.equals("")&&l!=null)
            {
              Sense sense=this.getSense(href);
              ban=false;
              i=0;
              for(Sense sx:l.getSenses())
              {
                i++;
                if(sense.itContainsTheSameSamples(sx))
                {
                  ban=true;
                  break;
                }
              }
            }
            if(ban)
              wnsn=String.valueOf(i);
            if(wnsn.equals(""))
            {
              stack.addAll(0,current.getContent());
              out.write("\t\t\t\t<!--Mismatch link for "+href.replace("&","&amp;")+" -->\n");
            }
            else
            {
              sentence+="\t\t\t\t<wf cmd=\"done\" pos=\"\" lemma=\""+Dictionary.normalizeLemmaforFile(lemma)
                  +"\" wnsn=\""+wnsn
                  +"\" lexsn=\""+Dictionary.normalizeLemmaforFile(l.getSenses().get(i-1).getSid())+"\">";
              sentence+=word;
              sentence+="</wf>\n";
            }
          }
          else
          {
            stack.addAll(0,current.getContent());
          }
         
        }
        else
        {
          if(current.getName().equals("tr")||current.getName().equals("p"))
          {
            sentence+=("\t\t\t</s>\n");
            if(sentence.contains("wf"))
            {
              System.out.print(".");
              s++;
              paragraph+=sentence;
            }
            if(paragraph.contains("wf"))
            {
              System.out.println("Saving paragraph "+String.valueOf(p));
              p++;
              paragraph+=("\t\t</p>\n");
              out.write(paragraph.replace("&","&amp;"));
            }
            s=1;
            sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
            paragraph="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
          }
          stack.addAll(0,current.getContent());       
        }
      }
    }
    sentence+=("\t\t\t</s>\n");
    if(sentence.contains("wf"))
    {
      System.out.print(".");
      s++;
      paragraph+=sentence;
    }
    if(paragraph.contains("wf"))
    {
      System.out.println("Saving paragraph "+String.valueOf(p));
      p++;
      paragraph+=("\t\t</p>\n");
      out.write(paragraph.replace("&","&amp;"));
    }
    out.write("\t</context>\n");
    out.write("</contextfile>\n");
    out.close();
    fout.close();
  }

    /**
     * Returns the number of search hits for two senses occurring together.
     * @param sense Target sense.
     * @param sense2 The other target sense.
     * @return The number of pages containing both senses.
     * @throws Exception
     */
  public double getCounts(Sense sense, Sense sense2) throws Exception{
    double w=0.0;
    String qry="\""+sense.getSid()+"\"";
    if(!sense.equals(sense2))
    {
      qry+="+AND+"+"\""+sense2.getSid()+"\"";
    }
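    //Query Special:Search for pages whose text contains both sense titles and read the hit count.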
    URL count=new URL(this.path+"/w/index.php?title=Special%3ASearch&profile=default&search="+qry+"&fulltext=Search");   
    Document xml=this.loadURL(count);
    if(xml!=null)
    {
      String aux="mw-search-formheader";
      for(Element e:this.getContentNode(xml).getDescendants(new ElementFilter("div")))
      {
        if(aux.equals(e.getAttributeValue("class")))
        {
          for(Element x:e.getDescendants(new ElementFilter("div")))
          {
            aux="results-info";
            if(aux.equals(x.getAttributeValue("class")))
            {
              aux=e.getValue().substring(e.getValue().indexOf(this.countText)+this.countText.length());
              aux=aux.substring(aux.indexOf(this.countPrepIn)+this.countPrepIn.length());
              aux=aux.substring(0,aux.indexOf(this.countPrepOut));
              aux=aux.replaceAll("\\D", "");       
              w=Double.parseDouble(aux);
              break;
            }
          }
          break;
        }
      }
    }
    else
    {
      String aux=this.loadURLAsText(count);
      aux=aux.substring(aux.indexOf("mw-search-formheader"));
      aux=aux.substring(aux.indexOf("results-info"));
      aux=aux.substring(aux.indexOf(this.countPrepIn)+this.countPrepIn.length());
      aux=aux.substring(0,aux.indexOf(this.countPrepOut));
      aux=aux.replaceAll("\\D", "");
      w=Double.parseDouble(aux);
    }
   
    return w;
  }
  @Override
  public boolean doesLemmaExists(String lemma) throws Exception {
    return this.getLemma(lemma)!=null;
  }

}