Package net.sf.joafip.entity.rel400

Examples of net.sf.joafip.entity.rel400.Element


     */
    private Element getContentNode(Document xml)
    {
      //Search for mw-content-text
    //Gloss starts inside the first <p> node and ends with the appearance of another node type (like <div> or <h2>)
    Element root=xml.getRootElement();
    Element body=null;
    for(Element e:root.getChildren())
    {
      if(e.getName().equals("body"))
      {
        body=e;
        break;
      }
    }
    Element content=null;
    for(Element e:body.getChildren())
    {
      if(e.getAttribute("id")!=null&&e.getAttributeValue("id").equals("content"))
      {
        content=e;
        break;
      }
    }
    Element bodycontent=null;
    for(Element e:content.getChildren())
    {
      if(e.getAttribute("id")!=null&&e.getAttribute("id").getValue().equals("bodyContent"))
      {
        bodycontent=e;
        break;
      }
    }
    Element gloss=null;
    for(Element e:bodycontent.getChildren())
    {
      if(e.getAttribute("id")!=null&&e.getAttribute("id").getValue().equals("mw-content-text"))
      {
        gloss=e;
View Full Code Here


          Filter<? extends Content> or=f.or(new ElementFilter("span"));
          for(Content c:div.getDescendants(or))
          {
            if(c.getCType()==CType.Element)
            {
              Element word=(Element)c;
              if(word.getName().equals("a"))
              {
                String href=word.getAttributeValue("href");
                if(href!=null&&href.startsWith("/wiki"))
                {
                  urls.add(new URL(this.path+href));
                  hindex.add(new Integer(heads.size()-1));
                }
              }
              else
              {
                if(word.getAttributeValue("class")!=null && word.getAttributeValue("class").equals("mw-headline"))
                {
                  heads.add(word.getAttributeValue("id"));
                  Element e=word.getParentElement();
                  levels.add(new Integer(Integer.parseInt(e.getName().substring(1))-2));               
                }
              }
            }
          }
          break;
        }
      }
      ArrayList<Sense> senses=new ArrayList<Sense>(urls.size());
      for(int i=0;i<urls.size();i++)
      {
        String urlx=urls.get(i).getFile().replace("/wiki/", "");
        if(!this.isNotAnArticle(urlx))//it is a sense
        {                   
          senses.add(this.getSense(urlx.replace("/", "")));
       
      }
      ArrayList<Count> counts=new ArrayList<Count>();
      counts.add(this.getWikiCounts(lemma));      ;     
      l=new Lemma(lemma,"",senses,counts,this.name);
    }
    else
    {
      url=new URL(this.path+"wiki/"+lemma.replace(" ", "_"));
      xml=this.loadURL(url);
      if(xml!=null)
        text=xml.getRootElement().getValue();
      if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
      {
        if(text.contains(this.disambiguationMSG)&&!this.jump)
        {         
          this.jump=true;
          ArrayList<Sense> senses=new ArrayList<Sense>();
          Element body=this.getContentNode(xml);
          for(Element e:body.getChildren())
          {
           
            if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("dablink"))
            {
              for(Content c:e.getContent())
              {
                if(c.getCType().equals(CType.Element))
                {
                  Element a=((Element)c);
                  if(a.getName().equals("a"))
                  {
                    String sid=a.getAttributeValue("href");
                    sid=sid.substring(sid.indexOf("wiki/")+5);
                    sid=sid.replace(this.disambiguationWord, "");
                    Lemma ll=this.getLemma(sid);
                    if(ll!=null)
                    {
View Full Code Here

      ArrayList<String> syns=new ArrayList<String>(1);
      syns.add(sid);
      s=new Sense(sid,"",syns);
      URL url=new URL(this.path+"wiki/"+sid.replace(" ", "_"));
      Document xml=this.loadURL(url);
      Element body=this.getContentNode(xml);
      //Check if definition is a section
      if(!xml.getRootElement().getValue().contains(this.redirect))
      {
        boolean start=false;
       
        for(Element e:body.getChildren())
        {
          if(e.getName().equals("p"))
          {
            start=true;
            s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
            for(Element ge:e.getChildren())
            {
              //Retrieve all the <a> for extracting the inGloss relation
              if(ge.getName().equals("a"))
              {
                String nurl=ge.getAttributeValue("href");
                s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
              }
            }
          }
          else
          {
            if(start)
              break;
          }
        }

      }
      else//Look for the start of the section
      {
        String all=xml.getRootElement().getValue();
        String section=all.substring(all.indexOf(this.redirect)+this.redirect.length());
        section=section.substring(0,section.indexOf("\")"));
       
        boolean start=false;
        for(Element e:body.getChildren())
        {
          if(start)
          {
            if(e.getName().equals("p"))
            {
              s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
              for(Element ge:e.getChildren())
              {
                //Retrieve all the <a> for extracting the inGloss relation
                if(ge.getName().equals("a"))
                {
                  String nurl=ge.getAttributeValue("href");
                  s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
                }
              }
            }
            else
            {
              break;
            }

          }
          else
          {
            if(e.getName().startsWith("h"))
            {
              for(Element es:e.getChildren())
              {
                if(es.getName().equals("span")&&section.equals(es.getAttributeValue("id")))
                {               
                  start=true;
                }
              }
            }
          }
         
        }
      }
      Element navbox=null;
      for(Element e:xml.getRootElement().getDescendants(new ElementFilter("table")))
      {
        if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("navbox"))
        {
            navbox=e;
            break;
        }
      }
      //Add inNavBox relations
      if(navbox!=null)
      {
        for(Element e:navbox.getDescendants(new ElementFilter("a")))
        {
          String nurl=e.getAttributeValue("href");
          if(nurl!=null&&!this.isNotAnArticle(nurl))
          {
            s.addRelation("inNavBox", new Relation("inNavBox", nurl.replace("/wiki/", ""), ""));
          }
        }
      }
      //Add in CatLinks relations
      Element catlinks=null;
      String aux="catlinks";
      for(Element e:body.getParent().getDescendants(new ElementFilter("div")))
      {
        if(aux.equals(e.getAttributeValue("id")))
        {
          catlinks=e;
          break;
        }
      }
      if(catlinks!=null)
      {
        for(Element e:catlinks.getDescendants(new ElementFilter("a")))
        {
          String nurl=e.getAttributeValue("href");
          if(nurl!=null&&!this.isNotAnArticle(nurl))
          {
            s.addRelation("inCatLinks", new Relation("inCatLinks", nurl.replace("/wiki/", ""), ""));
View Full Code Here

      f.mkdirs();
      FileWriter fout=new FileWriter(path+sid+".sgf");
      BufferedWriter out=new BufferedWriter(fout);
    URL url=new URL(this.path+"wiki/"+sid);
    Document xml=this.loadURL(url);
    Element body=this.getContentNode(xml);   
    DataBroker db=new DataBroker("gannuNLP.dictionaries.Wiki",this.language);
    db.setPath(this.path);   
    db.load("Glosses");
   
    out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    out.write("<contextfile concordance=\""+this.name.replace("->", ".")+"\">\n");
    out.write("\t<context filename=\""+sid.replace("&","&amp;")+"\" paras=\"yes\">\n");   
    int p=1;
    int s=1;
    String paragraph="";
    String sentence="";
    paragraph+="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
    sentence+="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
    ArrayList<Content> stack=new ArrayList<Content>();
    stack.addAll(body.getContent());
    while(stack.size()>0)
    {
      Content c=stack.get(0);
      stack.remove(0);
      if(c.getCType().equals(CType.Text))//actual text
      {
        //a dot creates a new sentence after processing
        String line=c.getValue().trim();
        while(!line.equals(""))
        {         
          int idx=line.indexOf(" ");
          String words;
          if(idx>=0)
            words=line.substring(0,idx);
          else
            words=line;
          line=line.substring(words.length()).trim();
          String punct=words.replaceAll("\\p{Punct}","�");
          int index=0;
          while(!punct.equals(""))
          {
            idx=punct.indexOf("�");
            String word;
            if(idx>=0)
              word=punct.substring(0,idx);
            else
              word=punct;
            if(word.equals(""))
            {
              //first the punctuation then the word
              //add a punc node
             
              if(words.charAt(index)=='<')
              {
                sentence+="\t\t\t\t<punc>&lt;</punc>\n";
              }
              else
              {
                if(words.charAt(index)=='>')
                  sentence+="\t\t\t\t<punc>&gt;</punc>\n";
                else
                  sentence+="\t\t\t\t<punc>"+words.charAt(index)+"</punc>\n";
              }
              if(words.charAt(index)=='.')
              {
                sentence+=("\t\t\t</s>\n");
                if(sentence.contains("wf"))
                {
                  System.out.print(".");
                  s++;
                  paragraph+=sentence;
                }
                sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
              }
              index++;
              punct=punct.substring(1);
            }
            else
            {
              index+=word.length();
              sentence+="\t\t\t\t<wf cmd=\"tag\" pos=\"\" lemma=\""+word+"\" wnsn=\"0\" lexsn=\"NA\">";
              sentence+=word;
              sentence+="</wf>\n";
              punct=punct.substring(word.length());
            }
             
          }
           
        }
      }
      if(c.getCType().equals(CType.Element))//other html elements such a or table should extract the text inside these elements
      {
        Element current=(Element)c;
        //tr creates a new sentence after processing
        String href=current.getAttributeValue("href");
     
        String aux="navbox";
        if(aux.equals(current.getAttributeValue("class")))
          break;
        if(href!=null&&current.getName().equals("a")&&!this.isNotAnArticle(href)&&!href.contains("Category:"))
        {
          if(!href.contains("%25"))
          {
            while(href.contains("%"))
            {
              int index=href.indexOf("%");         
              String first=href.substring(0,index);
              if(index>href.length())
                index=href.length();
              String last=href.substring(index+3);
              String hex="0x"+href.substring(index+1,index+3);
              byte b[];
              if(last.startsWith("%"))
              {
                b=new byte[2];             
                b[0]=(byte)Integer.decode(hex).intValue();
                b[1]=(byte)Integer.decode("0x"+last.substring(1,3)).intValue();
                last=last.substring(3);
              }
              else
              {
                b=new byte[1];
                b[0]=(byte)Integer.decode(hex).intValue();
              }
              href=first+new String(b,"UTF-8")+last;
            }
          }
          //Lematize the wiki word
          String word=current.getValue();
          String lemma=word;
          Lemma l=db.getLemma(word);
          href=href.substring(href.indexOf("wiki/")+5);
          boolean ac=true;
          if(l==null)
          {
              l=db.getLemma(href);
              if(l!=null)
                lemma=l.getLemma();
              ac=false;
          }
          if(l!=null)
          {
            int i=0;
            boolean ban=false;
            for(Sense sense:l.getSenses())
            {
              i++;
              if(sense.getSid().equals(href))
              {
                ban=true;
                break;
              }
            }
            String wnsn="";
            if(ban)
            {
              wnsn=String.valueOf(i);
            }
            else
            {
              if(ac)
              {
                l=db.getLemma(href);
                if(l!=null)
                {
                  i=0;
                  ban=false;
                  for(Sense sense:l.getSenses())
                  {
                    i++;
                    if(sense.getSid().equals(href))
                    {
                      ban=true;
                      break;
                    }
                  }
                  if(ban)
                    wnsn=String.valueOf(i);
                }
              }
            }
            if(wnsn.equals("")&&l!=null)
            {
              Sense sense=this.getSense(href);
              ban=false;
              i=0;
              for(Sense sx:l.getSenses())
              {
                i++;
                if(sense.itContainsTheSameSamples(sx))
                {
                  ban=true;
                  break;
                }
              }
            }
            if(ban)
              wnsn=String.valueOf(i);
            if(wnsn.equals(""))
            {  stack.addAll(0,current.getContent());
              out.write("\t\t\t\t<!--Mismatch link for "+href.replace("&","&amp;")+" -->\n");
            }
            else
            {
              sentence+="\t\t\t\t<wf cmd=\"done\" pos=\"\" lemma=\""+Dictionary.normalizeLemmaforFile(lemma)
                  +"\" wnsn=\""+wnsn
                  +"\" lexsn=\""+Dictionary.normalizeLemmaforFile(l.getSenses().get(i-1).getSid())+"\">";
              sentence+=word;
              sentence+="</wf>\n";
            }
          }
          else
          {
            stack.addAll(0,current.getContent());
          }
         
        }
        else
        {
          if(current.getName().equals("tr")||current.getName().equals("p"))
          {
            sentence+=("\t\t\t</s>\n");
            if(sentence.contains("wf"))
            {
              System.out.print(".");
              s++;
              paragraph+=sentence;
            }
            if(paragraph.contains("wf"))
            {
              System.out.println("Saving paragraph "+String.valueOf(p));
              p++;
              paragraph+=("\t\t</p>\n");
              out.write(paragraph.replace("&","&amp;"));
            }
            s=1;
            sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
            paragraph="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
          }
          stack.addAll(0,current.getContent());       
        }
      }
    }
    sentence+=("\t\t\t</s>\n");
    if(sentence.contains("wf"))
View Full Code Here

    return this.sq;
  }
 
  public void saveSQ(double sq)
  {
    Element thisSQ = new Element("sq");
    thisSQ.setText(sq+"");
    this.root.addContent(thisSQ);
    this.saveXML();
  }
View Full Code Here

      {
        String actualLanguage = (((Element) langList.getSelectedItem()).getName());
       
        String contentStr = content.getText();
       
        Element text = new Element(key);
        text.setText(contentStr);
       
        this.la.root.getChild(actualLanguage).addContent(text);
       
        this.saveXML();
       
View Full Code Here

   * </p>
   */
  private void bouttonSaveLangClicked()
  {
    String lang = langField.getText();
    this.la.root.addContent(new Element(lang));
    this.saveXML();
    this.newLangDialog.setVisible(false);
  }
View Full Code Here

          ArrayList<Element> words=new ArrayList<Element>();
          for(Element word:xml.getDescendants(new ElementFilter("wf")))
            words.add(word);
          for(int w=0;w<words.size();w++)
          {
            Element word=words.get(w);
            if(word.getAttribute("ot")!=null)
            {
              word.setAttribute("cmd", "ignore");
            }
            if((word.getAttribute("cmd").getValue().equals("done"))||(tag&&(word.getAttribute("cmd").getValue().equals("tag"))))
            {
              String lemma=word.getValue();
              String pos=word.getAttribute("pos").getValue().substring(0,1);
              String lem;
              Lemma l=null;
              if(word.getAttribute("cmd").getValue().equals("done"))
                { 
                if(word.getAttribute("lemma")!=null)
                {
                  lemma=word.getAttribute("lemma").getValue();
                    pos=word.getAttribute("pos").getValue().substring(0,1);
                    lem=lemma+"_"+pos;
                    l=dic.getLemma(lem);                 
                }
                else
                {
                  lemma="";
                  pos="X";
                }
                }
              lem=lemma+"_"+pos;
 
             
              if(l==null)//lemma may not be in normal form
              {
               
                SemCorCleaner.displayWindow(words, w);
                SemCorCleaner.readLemma(dic, lemma, word, pos);
                l=dic.getLemma(word.getAttributeValue("lemma")+"_"+word.getAttributeValue("pos").substring(0,1));
              }
              if(l!=null)
              {
                if(!SemCorCleaner.isValidWNSN(word.getAttributeValue("wnsn"), l.getSenses().size()))
                {
                  SemCorCleaner.displayWindow(words, w);
                  SemCorCleaner.checkSenses(word, l);
                }
              }
View Full Code Here

     @throws IOException For other kinds of errors.
     */

    public String translate( String html, XHtmlToWikiConfig config ) throws JDOMException, IOException
    {
        Element element = htmlStringToElement( html );
        XHtmlElementToWikiTranslator xhtmlTranslator = new XHtmlElementToWikiTranslator( element, config );
        String wikiMarkup = xhtmlTranslator.getWikiString();
        return wikiMarkup;
    }
View Full Code Here

     */
    private Element htmlStringToElement( String html ) throws JDOMException, IOException
    {
        SAXBuilder builder = new SAXBuilder( CYBERNEKO_PARSER, true );
        Document doc = builder.build( new StringReader( html ) );
        Element element = doc.getRootElement();
        return element;
    }
View Full Code Here

TOP

Related Classes of net.sf.joafip.entity.rel400.Element

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.