Package org.htmlcleaner

Examples of org.htmlcleaner.TagNode


    private String clean(String dirtyHtml) {
        try {
            HtmlCleaner cleaner = new HtmlCleaner();

            TagNode root = cleaner.clean(dirtyHtml);

            return new SimpleHtmlSerializer(cleaner.getProperties()).getAsString(root);
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }
View Full Code Here


        // Unescape HTML entities in the incoming source, then parse the result
        // with HtmlCleaner and record links and h1-h4 headings on `doc`.
        String rawData = StringEscapeUtils.unescapeHtml4(source);
           
        HtmlCleaner cleaner = new HtmlCleaner();
        //CleanerProperties props = cleaner.getProperties();        
        //props.setXXX(...);
        TagNode node = cleaner.clean(rawData);
        // Reused below for each tag-name lookup.
        TagNode[] myNodes;
       
        // Anchors: one "actonia_link" element per <a>, formatted as "href|anchor text".
        myNodes = node.getElementsByName("a", true);
        for (int i=0;i<myNodes.length;i++)
        {
            // NOTE(review): href is not null-checked here; if getAttributeByName can
            // return null for an <a> without href, the value becomes "null|..." — confirm.
            String href = myNodes[i].getAttributeByName("href");
            String anchorText = myNodes[i].getText().toString();
            doc.addElement("/job", "actonia_link", href + "|" + anchorText);
        }
        // Total number of <a> elements found (counted regardless of href presence).
        doc.addElement("/job", "actonia_link_count", String.valueOf(myNodes.length));

        // Headings: the text of every <h1> is stored under "actonia_h1".
        myNodes = node.getElementsByName("h1", true);
        for (int i=0;i<myNodes.length;i++)
        {
            doc.addElement("/job", "actonia_h1", myNodes[i].getText().toString());
        }

        // Same for <h2> -> "actonia_h2".
        myNodes = node.getElementsByName("h2", true);
        for (int i=0;i<myNodes.length;i++)
        {
            doc.addElement("/job", "actonia_h2", myNodes[i].getText().toString());
        }

        // Same for <h3> -> "actonia_h3".
        myNodes = node.getElementsByName("h3", true);
        for (int i=0;i<myNodes.length;i++)
        {
            doc.addElement("/job", "actonia_h3", myNodes[i].getText().toString());
        }

        // Same for <h4> -> "actonia_h4".
        myNodes = node.getElementsByName("h4", true);
        for (int i=0;i<myNodes.length;i++)
        {
            doc.addElement("/job", "actonia_h4", myNodes[i].getText().toString());
        }
View Full Code Here

    // Collects page metadata keyed by a normalized name:
    //   "meta_<name>[_<scheme>]"   for <meta name="..." content="...">
    //   "meta_equiv_<http-equiv>"  for <meta http-equiv="..." content="...">
    // Names are lower-cased and hyphens are replaced with underscores.
    final HashMap<String, String> m = new HashMap<String, String>();

    HtmlCleaner cleaner = new HtmlCleaner();
    //CleanerProperties props = cleaner.getProperties();    
    //props.setXXX(...);
    TagNode node = cleaner.clean(rawPage);
    TagNode[] myNodes;

    // <meta name="..." content="..." />
    // <meta http-equiv="refresh" content=
    myNodes = node.getElementsByName("meta", true);
    for (int i=0;i<myNodes.length;i++)
    {
      String name = myNodes[i].getAttributeByName("name");
      if (name!=null)
      {
        // When a scheme attribute is present it is appended to the name as "_<scheme>".
        String scheme = myNodes[i].getAttributeByName("scheme");
        if (scheme!=null)
          name += "_" + scheme;

        // Only non-empty content values are stored; a later tag with the same
        // key overwrites an earlier one (HashMap.put).
        String content = myNodes[i].getAttributeByName("content");
        if (content!=null && !"".equals(content))
        {
          m.put("meta_" + name.toLowerCase().replaceAll("\\-", "_"), content);
        }
      }

      // A single <meta> tag is checked for both name and http-equiv, so it can
      // contribute two entries if it carries both attributes.
      String equiv = myNodes[i].getAttributeByName("http-equiv");
      if (equiv!=null)
      {
        String content = myNodes[i].getAttributeByName("content");
        if (content!=null && !"".equals(content))
        {
          m.put("meta_equiv_" + equiv.toLowerCase().replaceAll("\\-", "_"), content);
        }
      }
    }

    // <link ... />
    myNodes = node.getElementsByName("link", true);
    for (int i=0;i<myNodes.length;i++)
    {
      String href = myNodes[i].getAttributeByName("href");
      String rel = myNodes[i].getAttributeByName("rel");
      if (href!=null && rel!=null && "canonical".equals(rel))
View Full Code Here

    // Returns the href of the first <base> element in rawPage, or null when the
    // page is null, has no <base> element, or that element has no href.

    // Cheap pre-check: skip HTML parsing when the page is null or does not
    // contain the substring "<base" (case-insensitive).
    if (rawPage==null || !StringUtils.containsIgnoreCase(rawPage, "<base")) return null;

    HtmlCleaner cleaner = new HtmlCleaner();
    //CleanerProperties props = cleaner.getProperties();    
    //props.setXXX(...);
    TagNode node = cleaner.clean(rawPage);
    TagNode[] myNodes = node.getElementsByName("base", true);
    if (myNodes==null || myNodes.length==0) return null;
    // Only the first <base> element is consulted; any further ones are ignored.
    String href = myNodes[0].getAttributeByName("href");
    // Both branches below yield href's value (null when the attribute is absent).
    if (href!=null) return href;
    return null;
  }
View Full Code Here

    // Accumulates links found in rawPage; each link is added at most once
    // (guarded by list.contains), in order of first appearance.
    final ArrayList<String> list = new ArrayList<String>();

    HtmlCleaner cleaner = new HtmlCleaner();
    //CleanerProperties props = cleaner.getProperties();    
    //props.setXXX(...);
    TagNode node = cleaner.clean(rawPage);

    TagNode[] myNodes;

    // For depth 1 or 2: collect hrefs from <a> and <area> elements.
    if (depth==1 || depth==2) {
      // <a href=
      myNodes = node.getElementsByName("a", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String link = myNodes[i].getAttributeByName("href");

        if (link!=null) {
          // Anchor hrefs are trimmed before validation.
          link = link.trim();

          // The null check is redundant here (trim() never returns null);
          // the effective test is that the trimmed link is non-empty.
          if (link!=null && !"".equals(link))
          {
            // isValidUrl is defined outside this fragment; its rules are not visible here.
            if (isValidUrl(link))
              if (!list.contains(link))
                list.add(link);
          }
        }
      }

      // <area href=
      // NOTE(review): unlike the <a> case above, the href is not trimmed here.
      myNodes = node.getElementsByName("area", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String link = myNodes[i].getAttributeByName("href");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
              list.add(link);
      }
    }

    if (depth==0 || depth==2) {
      // <frame src=
      myNodes = node.getElementsByName("frame", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String link = myNodes[i].getAttributeByName("src");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
              list.add(link);
      }

      // <iframe src=
      myNodes = node.getElementsByName("iframe", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String link = myNodes[i].getAttributeByName("src");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
              list.add(link);
      }

      // <meta http-equiv="refresh" content=
      myNodes = node.getElementsByName("meta", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String equiv = myNodes[i].getAttributeByName("http-equiv");
        if ((equiv!=null) && (equiv.equalsIgnoreCase("refresh")))
        {
          String link = myNodes[i].getAttributeByName("content");
          if (link!=null && !"".equals(link))
          {
            if (link.indexOf("=")>0)
            {
              link = link.substring(link.indexOf("=")+1);
              if (!list.contains(link))
                list.add(link);
            }
          }
        }
      }

      // Look for embeded flash
      // <param name="movie" value="..."
      myNodes = node.getElementsByName("param", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String name = myNodes[i].getAttributeByName("name");
        if ("movie".equals(name))
        {
View Full Code Here

        }

        String headContentsStr = headContents.toString();
        HtmlCleaner cleaner = new HtmlCleaner();
        // parse the string HTML
        TagNode pageData = cleaner.clean(headContentsStr);

    // read in the declared namespaces
    boolean hasOGspec = false;
    TagNode headElement = pageData.findElementByName("head", true);
    if (headElement.hasAttribute("prefix"))
    {
      String namespaceData = headElement.getAttributeByName("prefix");
      Pattern pattern = Pattern.compile("(([A-Za-z0-9_]+):\\s+(http:\\/\\/ogp.me\\/ns(\\/\\w+)*#))\\s*");
      Matcher matcher = pattern.matcher(namespaceData);
      while (matcher.find())
      {
                String prefix = matcher.group(2);
View Full Code Here

    public Collection<Node> apply(RDFBackend<Node> backend, Collection<Node>... args) throws IllegalArgumentException {
        List<Node> result = new ArrayList<Node>();

        for(Iterator<Node> it = Collections.iterator(args);it.hasNext();) {
            Node node = it.next();
            TagNode tagNode = cleaner.clean(transformer.transform(backend,node));
            try {
                result.add(backend.createLiteral(new CompactXmlSerializer(cleaner.getProperties()).getAsString(tagNode)));
            } catch (IOException e) {
                log.warn("I/O error while serializing to string",e);
            }
View Full Code Here

            it = Collections.iterator(args);
        }
        List<Node> result = new ArrayList<Node>();
        while(it.hasNext()) {
            Node node = it.next();
            TagNode tagNode = cleaner.clean(transformer.transform(backend,node));
            try {
                result.add(backend.createLiteral(new CompactXmlSerializer(cleaner.getProperties()).getAsString(tagNode)));
            } catch (IOException e) {
                log.warn("I/O error while serializing to string",e);
            }
View Full Code Here

    }

    // Try to convert html to xhtml
    HtmlCleaner cleaner = new HtmlCleaner();
    // The cleaner's properties are captured here; they are not used within the
    // visible part of this fragment (presumably consumed by later code — verify).
    CleanerProperties xhtmlProperties = cleaner.getProperties();
    TagNode xhtmlNode = cleaner.clean(html);
    // Bail out (after logging a warning naming the resource) if no node tree was produced.
    if (xhtmlNode == null) {
      logger.warn("Error creating well-formed document from page {}", resource);
      return;
    }
View Full Code Here

TOP

Related Classes of org.htmlcleaner.TagNode

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.