Package org.jdom2.input

Examples of org.jdom2.input.StAXEventBuilder

From a JDOM perspective XMLStreamReaders are more efficient than XMLEventReaders. Where possible use an XMLStreamReader.

If you happen to be looking at the source code, pay careful attention to the imports so you know what type of instance is being processed (a StAX class or a JDOM class), because there are name conflicts.

@author Rolf Lear


     */
    private Element getContentNode(Document xml)
    {
      //Search for mw-content-text
    //Gloss starts inside the first <p> nodes and ends with the apparition of another node type (like <div> or <h2>)
    Element root=xml.getRootElement();
    Element body=null;
    for(Element e:root.getChildren())
    {
      if(e.getName().equals("body"))
      {
        body=e;
        break;
      }
    }
    Element content=null;
    for(Element e:body.getChildren())
    {
      if(e.getAttribute("id")!=null&&e.getAttributeValue("id").equals("content"))
      {
        content=e;
        break;
      }
    }
    Element bodycontent=null;
    for(Element e:content.getChildren())
    {
      if(e.getAttribute("id")!=null&&e.getAttribute("id").getValue().equals("bodyContent"))
      {
        bodycontent=e;
        break;
      }
    }
    Element gloss=null;
    for(Element e:bodycontent.getChildren())
    {
      if(e.getAttribute("id")!=null&&e.getAttribute("id").getValue().equals("mw-content-text"))
      {
        gloss=e;
View Full Code Here


          Filter<? extends Content> or=f.or(new ElementFilter("span"));
          for(Content c:div.getDescendants(or))
          {
            if(c.getCType()==CType.Element)
            {
              Element word=(Element)c;
              if(word.getName().equals("a"))
              {
                String href=word.getAttributeValue("href");
                if(href!=null&&href.startsWith("/wiki"))
                {
                  urls.add(new URL(this.path+href));
                  hindex.add(new Integer(heads.size()-1));
                }
              }
              else
              {
                if(word.getAttributeValue("class")!=null && word.getAttributeValue("class").equals("mw-headline"))
                {
                  heads.add(word.getAttributeValue("id"));
                  Element e=word.getParentElement();
                  levels.add(new Integer(Integer.parseInt(e.getName().substring(1))-2));               
                }
              }
            }
          }
          break;
        }
      }
      ArrayList<Sense> senses=new ArrayList<Sense>(urls.size());
      for(int i=0;i<urls.size();i++)
      {
        String urlx=urls.get(i).getFile().replace("/wiki/", "");
        if(!this.isNotAnArticle(urlx))//it is a sense
        {                   
          senses.add(this.getSense(urlx.replace("/", "")));
       
      }
      ArrayList<Count> counts=new ArrayList<Count>();
      counts.add(this.getWikiCounts(lemma));      ;     
      l=new Lemma(lemma,"",senses,counts,this.name);
    }
    else
    {
      url=new URL(this.path+"wiki/"+lemma.replace(" ", "_"));
      xml=this.loadURL(url);
      if(xml!=null)
        text=xml.getRootElement().getValue();
      if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
      {
        if(text.contains(this.disambiguationMSG)&&!this.jump)
        {         
          this.jump=true;
          ArrayList<Sense> senses=new ArrayList<Sense>();
          Element body=this.getContentNode(xml);
          for(Element e:body.getChildren())
          {
           
            if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("dablink"))
            {
              for(Content c:e.getContent())
              {
                if(c.getCType().equals(CType.Element))
                {
                  Element a=((Element)c);
                  if(a.getName().equals("a"))
                  {
                    String sid=a.getAttributeValue("href");
                    sid=sid.substring(sid.indexOf("wiki/")+5);
                    sid=sid.replace(this.disambiguationWord, "");
                    Lemma ll=this.getLemma(sid);
                    if(ll!=null)
                    {
View Full Code Here

      ArrayList<String> syns=new ArrayList<String>(1);
      syns.add(sid);
      s=new Sense(sid,"",syns);
      URL url=new URL(this.path+"wiki/"+sid.replace(" ", "_"));
      Document xml=this.loadURL(url);
      Element body=this.getContentNode(xml);
      //Check if definition is a section
      if(!xml.getRootElement().getValue().contains(this.redirect))
      {
        boolean start=false;
       
        for(Element e:body.getChildren())
        {
          if(e.getName().equals("p"))
          {
            start=true;
            s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
            for(Element ge:e.getChildren())
            {
              //Retrieve all the <a> for extracting the inGloss relation
              if(ge.getName().equals("a"))
              {
                String nurl=ge.getAttributeValue("href");
                s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
              }
            }
          }
          else
          {
            if(start)
              break;
          }
        }

      }
      else//Look for the start of the section
      {
        String all=xml.getRootElement().getValue();
        String section=all.substring(all.indexOf(this.redirect)+this.redirect.length());
        section=section.substring(0,section.indexOf("\")"));
       
        boolean start=false;
        for(Element e:body.getChildren())
        {
          if(start)
          {
            if(e.getName().equals("p"))
            {
              s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
              for(Element ge:e.getChildren())
              {
                //Retrieve all the <a> for extracting the inGloss relation
                if(ge.getName().equals("a"))
                {
                  String nurl=ge.getAttributeValue("href");
                  s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
                }
              }
            }
            else
            {
              break;
            }

          }
          else
          {
            if(e.getName().startsWith("h"))
            {
              for(Element es:e.getChildren())
              {
                if(es.getName().equals("span")&&section.equals(es.getAttributeValue("id")))
                {               
                  start=true;
                }
              }
            }
          }
         
        }
      }
      Element navbox=null;
      for(Element e:xml.getRootElement().getDescendants(new ElementFilter("table")))
      {
        if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("navbox"))
        {
            navbox=e;
            break;
        }
      }
      //Add inNavBox relations
      if(navbox!=null)
      {
        for(Element e:navbox.getDescendants(new ElementFilter("a")))
        {
          String nurl=e.getAttributeValue("href");
          if(nurl!=null&&!this.isNotAnArticle(nurl))
          {
            s.addRelation("inNavBox", new Relation("inNavBox", nurl.replace("/wiki/", ""), ""));
          }
        }
      }
      //Add in CatLinks relations
      Element catlinks=null;
      String aux="catlinks";
      for(Element e:body.getParent().getDescendants(new ElementFilter("div")))
      {
        if(aux.equals(e.getAttributeValue("id")))
        {
          catlinks=e;
          break;
        }
      }
      if(catlinks!=null)
      {
        for(Element e:catlinks.getDescendants(new ElementFilter("a")))
        {
          String nurl=e.getAttributeValue("href");
          if(nurl!=null&&!this.isNotAnArticle(nurl))
          {
            s.addRelation("inCatLinks", new Relation("inCatLinks", nurl.replace("/wiki/", ""), ""));
View Full Code Here

      f.mkdirs();
      FileWriter fout=new FileWriter(path+sid+".sgf");
      BufferedWriter out=new BufferedWriter(fout);
    URL url=new URL(this.path+"wiki/"+sid);
    Document xml=this.loadURL(url);
    Element body=this.getContentNode(xml);   
    DataBroker db=new DataBroker("gannuNLP.dictionaries.Wiki",this.language);
    db.setPath(this.path);   
    db.load("Glosses");
   
    out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    out.write("<contextfile concordance=\""+this.name.replace("->", ".")+"\">\n");
    out.write("\t<context filename=\""+sid.replace("&","&amp;")+"\" paras=\"yes\">\n");   
    int p=1;
    int s=1;
    String paragraph="";
    String sentence="";
    paragraph+="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
    sentence+="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
    ArrayList<Content> stack=new ArrayList<Content>();
    stack.addAll(body.getContent());
    while(stack.size()>0)
    {
      Content c=stack.get(0);
      stack.remove(0);
      if(c.getCType().equals(CType.Text))//actual text
      {
        //a dot creates a new sentence after processing
        String line=c.getValue().trim();
        while(!line.equals(""))
        {         
          int idx=line.indexOf(" ");
          String words;
          if(idx>=0)
            words=line.substring(0,idx);
          else
            words=line;
          line=line.substring(words.length()).trim();
          String punct=words.replaceAll("\\p{Punct}","�");
          int index=0;
          while(!punct.equals(""))
          {
            idx=punct.indexOf("�");
            String word;
            if(idx>=0)
              word=punct.substring(0,idx);
            else
              word=punct;
            if(word.equals(""))
            {
              //first the punctuation then the word
              //add a punc node
             
              if(words.charAt(index)=='<')
              {
                sentence+="\t\t\t\t<punc>&lt;</punc>\n";
              }
              else
              {
                if(words.charAt(index)=='>')
                  sentence+="\t\t\t\t<punc>&gt;</punc>\n";
                else
                  sentence+="\t\t\t\t<punc>"+words.charAt(index)+"</punc>\n";
              }
              if(words.charAt(index)=='.')
              {
                sentence+=("\t\t\t</s>\n");
                if(sentence.contains("wf"))
                {
                  System.out.print(".");
                  s++;
                  paragraph+=sentence;
                }
                sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
              }
              index++;
              punct=punct.substring(1);
            }
            else
            {
              index+=word.length();
              sentence+="\t\t\t\t<wf cmd=\"tag\" pos=\"\" lemma=\""+word+"\" wnsn=\"0\" lexsn=\"NA\">";
              sentence+=word;
              sentence+="</wf>\n";
              punct=punct.substring(word.length());
            }
             
          }
           
        }
      }
      if(c.getCType().equals(CType.Element))//other html elements such a or table should extract the text inside these elements
      {
        Element current=(Element)c;
        //tr creates a new sentence after processing
        String href=current.getAttributeValue("href");
     
        String aux="navbox";
        if(aux.equals(current.getAttributeValue("class")))
          break;
        if(href!=null&&current.getName().equals("a")&&!this.isNotAnArticle(href)&&!href.contains("Category:"))
        {
          if(!href.contains("%25"))
          {
            while(href.contains("%"))
            {
              int index=href.indexOf("%");         
              String first=href.substring(0,index);
              if(index>href.length())
                index=href.length();
              String last=href.substring(index+3);
              String hex="0x"+href.substring(index+1,index+3);
              byte b[];
              if(last.startsWith("%"))
              {
                b=new byte[2];             
                b[0]=(byte)Integer.decode(hex).intValue();
                b[1]=(byte)Integer.decode("0x"+last.substring(1,3)).intValue();
                last=last.substring(3);
              }
              else
              {
                b=new byte[1];
                b[0]=(byte)Integer.decode(hex).intValue();
              }
              href=first+new String(b,"UTF-8")+last;
            }
          }
          //Lematize the wiki word
          String word=current.getValue();
          String lemma=word;
          Lemma l=db.getLemma(word);
          href=href.substring(href.indexOf("wiki/")+5);
          boolean ac=true;
          if(l==null)
          {
              l=db.getLemma(href);
              if(l!=null)
                lemma=l.getLemma();
              ac=false;
          }
          if(l!=null)
          {
            int i=0;
            boolean ban=false;
            for(Sense sense:l.getSenses())
            {
              i++;
              if(sense.getSid().equals(href))
              {
                ban=true;
                break;
              }
            }
            String wnsn="";
            if(ban)
            {
              wnsn=String.valueOf(i);
            }
            else
            {
              if(ac)
              {
                l=db.getLemma(href);
                if(l!=null)
                {
                  i=0;
                  ban=false;
                  for(Sense sense:l.getSenses())
                  {
                    i++;
                    if(sense.getSid().equals(href))
                    {
                      ban=true;
                      break;
                    }
                  }
                  if(ban)
                    wnsn=String.valueOf(i);
                }
              }
            }
            if(wnsn.equals("")&&l!=null)
            {
              Sense sense=this.getSense(href);
              ban=false;
              i=0;
              for(Sense sx:l.getSenses())
              {
                i++;
                if(sense.itContainsTheSameSamples(sx))
                {
                  ban=true;
                  break;
                }
              }
            }
            if(ban)
              wnsn=String.valueOf(i);
            if(wnsn.equals(""))
            {  stack.addAll(0,current.getContent());
              out.write("\t\t\t\t<!--Mismatch link for "+href.replace("&","&amp;")+" -->\n");
            }
            else
            {
              sentence+="\t\t\t\t<wf cmd=\"done\" pos=\"\" lemma=\""+Dictionary.normalizeLemmaforFile(lemma)
                  +"\" wnsn=\""+wnsn
                  +"\" lexsn=\""+Dictionary.normalizeLemmaforFile(l.getSenses().get(i-1).getSid())+"\">";
              sentence+=word;
              sentence+="</wf>\n";
            }
          }
          else
          {
            stack.addAll(0,current.getContent());
          }
         
        }
        else
        {
          if(current.getName().equals("tr")||current.getName().equals("p"))
          {
            sentence+=("\t\t\t</s>\n");
            if(sentence.contains("wf"))
            {
              System.out.print(".");
              s++;
              paragraph+=sentence;
            }
            if(paragraph.contains("wf"))
            {
              System.out.println("Saving paragraph "+String.valueOf(p));
              p++;
              paragraph+=("\t\t</p>\n");
              out.write(paragraph.replace("&","&amp;"));
            }
            s=1;
            sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
            paragraph="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
          }
          stack.addAll(0,current.getContent());       
        }
      }
    }
    sentence+=("\t\t\t</s>\n");
    if(sentence.contains("wf"))
View Full Code Here

    return this.sq;
  }
 
  public void saveSQ(double sq)
  {
    Element thisSQ = new Element("sq");
    thisSQ.setText(sq+"");
    this.root.addContent(thisSQ);
    this.saveXML();
  }
View Full Code Here

      {
        String actualLanguage = (((Element) langList.getSelectedItem()).getName());
       
        String contentStr = content.getText();
       
        Element text = new Element(key);
        text.setText(contentStr);
       
        this.la.root.getChild(actualLanguage).addContent(text);
       
        this.saveXML();
       
View Full Code Here

   * </p>
   */
  private void bouttonSaveLangClicked()
  {
    String lang = langField.getText();
    this.la.root.addContent(new Element(lang));
    this.saveXML();
    this.newLangDialog.setVisible(false);
  }
View Full Code Here

          ArrayList<Element> words=new ArrayList<Element>();
          for(Element word:xml.getDescendants(new ElementFilter("wf")))
            words.add(word);
          for(int w=0;w<words.size();w++)
          {
            Element word=words.get(w);
            if(word.getAttribute("ot")!=null)
            {
              word.setAttribute("cmd", "ignore");
            }
            if((word.getAttribute("cmd").getValue().equals("done"))||(tag&&(word.getAttribute("cmd").getValue().equals("tag"))))
            {
              String lemma=word.getValue();
              String pos=word.getAttribute("pos").getValue().substring(0,1);
              String lem;
              Lemma l=null;
              if(word.getAttribute("cmd").getValue().equals("done"))
                { 
                if(word.getAttribute("lemma")!=null)
                {
                  lemma=word.getAttribute("lemma").getValue();
                    pos=word.getAttribute("pos").getValue().substring(0,1);
                    lem=lemma+"_"+pos;
                    l=dic.getLemma(lem);                 
                }
                else
                {
                  lemma="";
                  pos="X";
                }
                }
              lem=lemma+"_"+pos;
 
             
              if(l==null)//lemma may not be in normal form
              {
               
                SemCorCleaner.displayWindow(words, w);
                SemCorCleaner.readLemma(dic, lemma, word, pos);
                l=dic.getLemma(word.getAttributeValue("lemma")+"_"+word.getAttributeValue("pos").substring(0,1));
              }
              if(l!=null)
              {
                if(!SemCorCleaner.isValidWNSN(word.getAttributeValue("wnsn"), l.getSenses().size()))
                {
                  SemCorCleaner.displayWindow(words, w);
                  SemCorCleaner.checkSenses(word, l);
                }
              }
View Full Code Here

     @throws IOException For other kinds of errors.
     */

    public String translate( String html, XHtmlToWikiConfig config ) throws JDOMException, IOException
    {
        Element element = htmlStringToElement( html );
        XHtmlElementToWikiTranslator xhtmlTranslator = new XHtmlElementToWikiTranslator( element, config );
        String wikiMarkup = xhtmlTranslator.getWikiString();
        return wikiMarkup;
    }
View Full Code Here

     */
    private Element htmlStringToElement( String html ) throws JDOMException, IOException
    {
        SAXBuilder builder = new SAXBuilder( CYBERNEKO_PARSER, true );
        Document doc = builder.build( new StringReader( html ) );
        Element element = doc.getRootElement();
        return element;
    }
View Full Code Here

TOP

Related Classes of org.jdom2.input.StAXEventBuilder

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.