Package org.htmlcleaner

Examples of org.htmlcleaner.TagNode


  }
 
  public void recherche(String lien) throws MalformedURLException, IOException, XPatherException
  {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode node = cleaner.clean(new URL(lien));
    info = new String();
    int count = 0;
   
    for (Object o : node.evaluateXPath("//body//p"))
    {
      if(count > && count < 4){
        info = info + ((TagNode)(o)).getText() + "\n";
      }
      count++;
View Full Code Here


    props.setTranslateSpecialEntities(true);
    props.setTransSpecialEntitiesToNCR(true);
   
    HtmlCleaner cleaner = new HtmlCleaner(props);
     
    TagNode node = cleaner.clean(new URL(lien));
    //System.out.println("Title: " + ((TagNode)(node.evaluateXPath("//title")[0])).getText());
    for (Object o : node.evaluateXPath("//ul[@id='acces_1']//li/a"))
    {
      String dUrl = ((TagNode)(o)).getAttributeByName("href");
      //System.out.println("LI: " + org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(((TagNode)(o)).getText().toString()));
      //System.out.println("href: "+dUrl);
      ajout = "LI: " + org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(((TagNode)(o)).getText().toString())+"\n";
View Full Code Here

    props.setTransSpecialEntitiesToNCR(true);
    props.setOmitComments(true);
     
    HtmlCleaner cleaner = new HtmlCleaner(props);

    TagNode node = cleaner.clean(new URL(lien));
   
    for (Object o : node.evaluateXPath("//div[@id='retour_accueil']/a/img"))
    {
      //System.out.println(((TagNode)(o)).getAllChildren());
      lien_logo = ((TagNode)(o)).getAttributeByName("src");
      //System.out.println("lien_logo "+lien_logo);
    }
   
    String temp2[] = new String[10];
    for (Object o : node.evaluateXPath("//style"))
    {
      //System.out.println(((TagNode)(o)).getAllChildren());
      for(Object temp: ((TagNode)(o)).getAllChildren())
      {
        if(temp.toString().contains("}"))
View Full Code Here

    props.setOmitComments(true);
     
    HtmlCleaner cleaner = new HtmlCleaner(props);
     
    int i=0;
    TagNode node = cleaner.clean(new URL(lien));
   
    for (Object o : node.evaluateXPath("//div[@class='encadre_fiche firstencadre']/div/div/a"))
    {
      lien_site_spe = ((TagNode)(o)).getAttributeByName("href");
      //System.out.println("lien spe "+lien_site_spe);
    }
   
    for (Object o : node.evaluateXPath("//div[@id='infos_generales']/table/tbody/tr"))
    {
      for(Object temp: ((TagNode)(o)).getAllChildren())
      {
        if(temp.toString().contains("th"))
        {
          for(Object temp2: ((TagNode)(temp)).getAllChildren())
          {
            info_principal[i][0] = org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(temp2.toString());
          }
        }
        if(temp.toString().contains("td"))
        {
          for(Object temp2: ((TagNode)(temp)).getAllChildren())
          {
            info_principal[i][1] = org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(temp2.toString());
            i++;
          }
        }
      }
    }
   
    i=0;
    for (Object o : node.evaluateXPath("//div[@class='element_deco'][2]"))
    {
      //System.out.println("description "+((TagNode)(o)).getAllChildren());
      for(Object temp: ((TagNode)(o)).getAllChildren())
      {
        if(temp.toString().contains("strong"))
        {
          for(Object temp2: ((TagNode)(temp)).getAllChildren())
          {
            description[i][0] =org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(temp2.toString());
          }
        }
        else if(temp.toString().equals("div"))
        {
          for(Object temp2: ((TagNode)(temp)).getAllChildren())
          {
            if(temp2.toString().equals("font"))
            {
              for(Object temp3: ((TagNode)(temp2)).getAllChildren())
              {
                if(temp3.toString().equals("strong"))
                {
                  for(Object temp4: ((TagNode)(temp3)).getAllChildren())
                  {
                    //System.out.println("description strong "+temp4.toString());
                    description[i][0] =org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(temp2.toString());
                    i++;
                  }
                }
                else if(temp3.toString().length()>=5)
                {
                  //System.out.println("description font "+temp3.toString());
                  description[i][0] =org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(temp2.toString());
                  i++;
                }
              }
            }
            else if(temp2.toString().length()>=5)
            {
              //System.out.println("description div "+temp2.toString());
              description[i][0] =org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(temp2.toString());
              i++;
            }
          }
        }
        else if(temp.toString().equals("p"))
        {
          for(Object temp2: ((TagNode)(temp)).getAllChildren())
          {
            if(temp2.toString().equals("strong"))
            {
              for(Object temp3: ((TagNode)(temp2)).getAllChildren())
              {
                //System.out.println("description 3 "+temp3.toString());
                break;
              }
            }
            else if(temp2.toString().length()>=5)
            {
              //System.out.println("description i "+i+" "+temp2.toString());
              description[i][0] =org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(temp2.toString());
              i++;
              break;
            }
          }
        }
        else if(temp.toString().length()>=5)
        {
          description[i][1] = org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(temp.toString());
          break;
        }
      }
    }
   
    i=0;
    int ok=0;
    for (Object o : node.evaluateXPath("//div[@class='encadre_fiche']"))
    {
      for(Object temp: ((TagNode)(o)).getAllChildren())
      {
        if(ok == 1)
        {
View Full Code Here

 
 
  public boolean recherche(String mot, String lien) throws MalformedURLException, IOException, XPatherException
  {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode node = cleaner.clean(new URL(lien));
    boolean bool = false;

    String[] decoupage = mot.split(" ");
    if(decoupage.length == 1)
    {
      for (Object o : node.evaluateXPath("//table[@class='clFlatTable']//tr//td"))
      {
        if(((((TagNode)(o)).getText()).toString()).toLowerCase().equals(mot.toLowerCase()))
        {
          bool = true;
          int i = 0;
          String [] tabl = new String[6];
          for(Object k : ((TagNode)(o)).getParent().evaluateXPath("td"))
          {
            String str = ((TagNode)(k)).getText().toString();
            tabl[i] = str;
            i++;
          }
          resultats.addElement(tabl);
       
      }
    }
    else
    {
      test = false;
      for (Object o : node.evaluateXPath("//table[@class='clFlatTable']//tr//td"))
      {
        if(((((TagNode)(o)).getText()).toString()).toLowerCase().equals(decoupage[0].toLowerCase()))
        {
          int tag = -1;
          if(((((TagNode)(o)).getText()).toString()).toLowerCase().equals(decoupage[0].toLowerCase()))
View Full Code Here

 
  /* (non-Javadoc)
   * @see org.apache.wookie.util.html.IHtmlProcessor#injectScript(java.lang.String)
   */
  public void injectScript(String script) {
    TagNode js = new TagNode(SCRIPT_TAG);
    js.addAttribute(TYPE_ATTRIBUTE, TYPE_ATTRIBUTE_VALUE);
    js.addAttribute(SRC_ATTRIBUTE, script);
    headNode.addChild(js);
  }
View Full Code Here

  /* (non-Javadoc)
   * @see org.apache.wookie.util.html.IHtmlProcessor#injectStylesheet(java.lang.String)
   */
  public void injectStylesheet(String stylesheet) {
    TagNode js = new TagNode(LINK_TAG);
    js.addAttribute(TYPE_ATTRIBUTE, CSS_TYPE_ATTRIBUTE_VALUE);
    js.addAttribute(REL_ATTRIBUTE, CSS_REL_ATTRIBUTE_VALUE);
    js.addAttribute(HREF_ATTRIBUTE, stylesheet);
    headNode.addChild(js);
  }
View Full Code Here

   
    //
    // Check if the page already has a META http-equiv=content-type tag,
    // if it doesn't create one and add it to the head node
    //
    TagNode meta = headNode.findElementByAttValue("http-equiv", "content-type", true, false);
    if (meta == null) {
      meta = new TagNode(META_TAG);
      meta.addAttribute("http-equiv", "Content-Type");
      headNode.getChildren().add(0, meta);
    }
    //
    // Force UTF into lowercase
    //
    if (charset.equals("UTF-8")) charset = "utf-8";
   
    //
    // Override the charset and content-type values for the
    // META http-equiv=content-type tag
    //
    meta.addAttribute("content", type + ";charset=" + charset);
  }
View Full Code Here

   * @param callback
   * @return
   */
  private static Node traverse(BaseToken root, int depth, Callback callback) {
    if (root instanceof TagNode) {
      TagNode tn = (TagNode) root;

      int height = -1;
      int hash = HashCodeUtil.SEED;

      hash = HashCodeUtil.hash(hash, tn.getName());
      hash = HashCodeUtil.hash(hash, tn.getAttributes().toString());

      for (Object n : tn.getChildren()) {
        if (isInteresting(n)) {
          Node c = traverse((BaseToken) n, depth + 1, callback);
          hash = HashCodeUtil.hash(hash, c);
          if (c.height > height)
            height = c.height;
View Full Code Here

   * @throws TemplateNotFoundException
   *             if this template doesn't match the given document.
   * @throws IOException
   */
  public String clean(String document) throws IOException {
    TagNode root = parse(document);
    Traverser.traverse(root, new Pruner(mapping));
    return write(root);
  }
View Full Code Here

TOP

Related Classes of org.htmlcleaner.TagNode

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.