Package org.jdom2

Examples of org.jdom2.Document


  public void loadCoreData() throws Exception {
    URL url=new URL(this.path);   
    this.glossCount=0.0;
    try
    {
      Document xml=this.loadURL(url);
      if(language.equals("en"))
      {
        for(Element div:xml.getDescendants(new ElementFilter("a")))
        {
          if(div.getAttributeValue("href")!=null&&div.getAttributeValue("href").equals("/wiki/Special:Statistics"))
          {
            String str=div.getValue();
            System.out.println(str);
            this.glossCount=Double.parseDouble(str.replace(",", ""));
            break;
          }
        }
      }
      else
      {
        for(Element div:xml.getDescendants(new ElementFilter("li")))
        {
          if(div.getAttributeValue("id")!=null && div.getAttributeValue("id").contains("lang-"))
          {
            for(Element span:div.getDescendants(new ElementFilter("span")))
            {
              if(span.getAttributeValue("lang").equals(language))
              {
                String str=div.getValue();
                str=str.substring(0,str.indexOf(" articles"));
                str=str.split("More than ")[1];
                System.out.println(str);
                this.glossCount=Double.parseDouble(str.replace(",", ""));
              }
            }
          }
        }
      }
      if(language.equals("es"))
      {
        String f=xml.getRootElement().getValue();
        f=f.substring(f.indexOf(" art�culos en espa�ol")-30,f.indexOf("art�culos en espa�ol")).split("\n")[1].replace("\u00a0","");
        this.glossCount=Double.parseDouble(f);
      }
      this.wordCount=this.glossCount*320.0;
      url=new URL(this.path);
View Full Code Here


  Lemma l=null;
 
  int iidx=this.badSearches.indexOf(lemma);
  if(iidx<0)
  {
    Document xml=this.loadURL(url);
    String text="";
    if(xml!=null)
      text=xml.getRootElement().getValue();
    if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
    {
      ArrayList<URL> urls=new ArrayList<URL>();
      ArrayList<Integer> levels=new ArrayList<Integer>();
      ArrayList<Integer> hindex=new ArrayList<Integer>();
      ArrayList<String> heads=new ArrayList<String>();
      for(Element div:xml.getDescendants(new ElementFilter("div")))
      {
        if(div.getAttributeValue("id")!=null&&div.getAttributeValue("id").equals("content"))
        {
          ElementFilter f=new ElementFilter("a");
          Filter<? extends Content> or=f.or(new ElementFilter("span"));
          for(Content c:div.getDescendants(or))
          {
            if(c.getCType()==CType.Element)
            {
              Element word=(Element)c;
              if(word.getName().equals("a"))
              {
                String href=word.getAttributeValue("href");
                if(href!=null&&href.startsWith("/wiki"))
                {
                  urls.add(new URL(this.path+href));
                  hindex.add(new Integer(heads.size()-1));
                }
              }
              else
              {
                if(word.getAttributeValue("class")!=null && word.getAttributeValue("class").equals("mw-headline"))
                {
                  heads.add(word.getAttributeValue("id"));
                  Element e=word.getParentElement();
                  levels.add(new Integer(Integer.parseInt(e.getName().substring(1))-2));               
                }
              }
            }
          }
          break;
        }
      }
      ArrayList<Sense> senses=new ArrayList<Sense>(urls.size());
      for(int i=0;i<urls.size();i++)
      {
        String urlx=urls.get(i).getFile().replace("/wiki/", "");
        if(!this.isNotAnArticle(urlx))//it is a sense
        {                   
          senses.add(this.getSense(urlx.replace("/", "")));
       
      }
      ArrayList<Count> counts=new ArrayList<Count>();
      counts.add(this.getWikiCounts(lemma));      ;     
      l=new Lemma(lemma,"",senses,counts,this.name);
    }
    else
    {
      url=new URL(this.path+"wiki/"+lemma.replace(" ", "_"));
      xml=this.loadURL(url);
      if(xml!=null)
        text=xml.getRootElement().getValue();
      if(xml!=null&&!text.contains(this.missingMSG)&&!text.contains(this.wikiErrorMSG))
      {
        if(text.contains(this.disambiguationMSG)&&!this.jump)
        {         
          this.jump=true;
View Full Code Here

   * @return The HTML document.
   * @throws Exception
   */
  Document loadURL(URL url) throws Exception{
    SAXBuilder builder=new SAXBuilder();
    Document xml=null;
    HttpURLConnection con=(HttpURLConnection) url.openConnection();
    con.setInstanceFollowRedirects(true);
   
    InputStream in;
    try
View Full Code Here

   * @throws Exception
   */
  public Count getWikiCounts(String lemma) throws Exception{
    double w=0.0;
    URL count=new URL(this.path+"/w/index.php?title=Special%3ASearch&profile=default&search="+lemma.replace(" ", "%20")+"&fulltext=Search");   
    Document xml=this.loadURL(count)
    if(xml!=null)
    {
      String aux="mw-search-formheader";
      for(Element e:this.getContentNode(xml).getDescendants(new ElementFilter("div")))
      {
View Full Code Here

    {
      ArrayList<String> syns=new ArrayList<String>(1);
      syns.add(sid);
      s=new Sense(sid,"",syns);
      URL url=new URL(this.path+"wiki/"+sid.replace(" ", "_"));
      Document xml=this.loadURL(url);
      Element body=this.getContentNode(xml);
      //Check if definition is a section
      if(!xml.getRootElement().getValue().contains(this.redirect))
      {
        boolean start=false;
       
        for(Element e:body.getChildren())
        {
          if(e.getName().equals("p"))
          {
            start=true;
            s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
            for(Element ge:e.getChildren())
            {
              //Retrieve all the <a> for extracting the inGloss relation
              if(ge.getName().equals("a"))
              {
                String nurl=ge.getAttributeValue("href");
                s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
              }
            }
          }
          else
          {
            if(start)
              break;
          }
        }

      }
      else//Look for the start of the section
      {
        String all=xml.getRootElement().getValue();
        String section=all.substring(all.indexOf(this.redirect)+this.redirect.length());
        section=section.substring(0,section.indexOf("\")"));
       
        boolean start=false;
        for(Element e:body.getChildren())
        {
          if(start)
          {
            if(e.getName().equals("p"))
            {
              s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
              for(Element ge:e.getChildren())
              {
                //Retrieve all the <a> for extracting the inGloss relation
                if(ge.getName().equals("a"))
                {
                  String nurl=ge.getAttributeValue("href");
                  s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
                }
              }
            }
            else
            {
              break;
            }

          }
          else
          {
            if(e.getName().startsWith("h"))
            {
              for(Element es:e.getChildren())
              {
                if(es.getName().equals("span")&&section.equals(es.getAttributeValue("id")))
                {               
                  start=true;
                }
              }
            }
          }
         
        }
      }
      Element navbox=null;
      for(Element e:xml.getRootElement().getDescendants(new ElementFilter("table")))
      {
        if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("navbox"))
        {
            navbox=e;
            break;
View Full Code Here

      File f=new File(path);
      f.mkdirs();
      FileWriter fout=new FileWriter(path+sid+".sgf");
      BufferedWriter out=new BufferedWriter(fout);
    URL url=new URL(this.path+"wiki/"+sid);
    Document xml=this.loadURL(url);
    Element body=this.getContentNode(xml);   
    DataBroker db=new DataBroker("gannuNLP.dictionaries.Wiki",this.language);
    db.setPath(this.path);   
    db.load("Glosses");
   
View Full Code Here

    if(!sense.equals(sense2))
    {
      qry+="+AND+"+"\""+sense2.getSid()+"\"";
    }
    URL count=new URL(this.path+"/w/index.php?title=Special%3ASearch&profile=default&search="+qry+"&fulltext=Search");   
    Document xml=this.loadURL(count)
    if(xml!=null)
    {
      String aux="mw-search-formheader";
      for(Element e:this.getContentNode(xml).getDescendants(new ElementFilter("div")))
      {
View Full Code Here

        System.out.println("-----------Checking file: "+file.getName()+ " "+String.valueOf(x)+"/"+String.valueOf(files.size())+"--------------");
        x++;
        try
        {
          SAXBuilder builder=new SAXBuilder();
          Document xml=(Document)builder.build(file);
          ArrayList<Element> words=new ArrayList<Element>();
          for(Element word:xml.getDescendants(new ElementFilter("wf")))
            words.add(word);
          for(int w=0;w<words.size();w++)
          {
            Element word=words.get(w);
            if(word.getAttribute("ot")!=null)
View Full Code Here

     * @throws IOException
     */
    private Element htmlStringToElement( String html ) throws JDOMException, IOException
    {
        SAXBuilder builder = new SAXBuilder( CYBERNEKO_PARSER, true );
        Document doc = builder.build( new StringReader( html ) );
        Element element = doc.getRootElement();
        return element;
    }
View Full Code Here

     @param element The element to get HTML from.
     *  @return HTML
     */
    public static String element2String( Element element )
    {
        Document document = new Document( element );
        XMLOutputter outputter = new XMLOutputter();
        return outputter.outputString( document );
    }
View Full Code Here

TOP

Related Classes of org.jdom2.Document

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.