Package org.htmlcleaner

Examples of org.htmlcleaner.TagNode


    node.serialize(serializer, writer);
    return writer.toString();
  }
 
  private static Set<Node> condense(String document) throws IOException {
    TagNode root = parse(document);
    Collector collector = Traverser.traverse(root, new Collector());
    return new HashSet<Node>(collector.getNodes());
  }
View Full Code Here


  }
 
  private static String getError(String response) throws IOException{
    String error = null;
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode html = cleaner.clean(response);
    TagNode errortag = html.findElementByAttValue("id", "error", true, true);
    if (errortag != null){
      error = errortag.getAttributeByName("title");
    }
    return error;
  }
View Full Code Here

 
  /* (non-Javadoc)
   * @see org.apache.wookie.util.html.IHtmlProcessor#injectScript(java.lang.String)
   */
  public void injectScript(String script) {
    TagNode js = new TagNode(SCRIPT_TAG);
    js.addAttribute(TYPE_ATTRIBUTE, TYPE_ATTRIBUTE_VALUE);
    js.addAttribute(SRC_ATTRIBUTE, script);
    headNode.addChild(js);
  }
View Full Code Here

  /* (non-Javadoc)
   * @see org.apache.wookie.util.html.IHtmlProcessor#injectStylesheet(java.lang.String)
   */
  public void injectStylesheet(String stylesheet) {
    TagNode js = new TagNode(LINK_TAG);
    js.addAttribute(TYPE_ATTRIBUTE, CSS_TYPE_ATTRIBUTE_VALUE);
    js.addAttribute(REL_ATTRIBUTE, CSS_REL_ATTRIBUTE_VALUE);
    js.addAttribute(HREF_ATTRIBUTE, stylesheet);
    headNode.addChild(js);
  }
View Full Code Here

          if (escapeXml && !specialCase) {
            content = escapeXml(content, props, true);
          }
          element.appendChild( specialCase ? document.createCDATASection(content) : document.createTextNode(content) );
        } else if (item instanceof TagNode) {
          TagNode subTagNode = (TagNode) item;
          Element subelement = document.createElement( subTagNode.getName() );;
         
          setAttributes(subTagNode, subelement);
         
          // recursively create subnodes
          createSubnodes(document, subelement, subTagNode.getChildren());
         
          element.appendChild(subelement);
        } else if (item instanceof List) {
          List sublist = (List) item;
          createSubnodes(document, element, sublist);
View Full Code Here

      //
      // Check for charset overrides in the HTML start page
      //
      HtmlCleaner cleaner = new HtmlCleaner();
      TagNode httpEquivNode = cleaner.clean(get.getResponseBodyAsStream()).findElementByAttValue("http-equiv", "content-type", true, false);
      if (httpEquivNode != null && httpEquivNode.hasAttribute("content")){
        String value = httpEquivNode.getAttributeByName("content");
        int offset = value.indexOf("charset=");
        if (offset >= -1){
            charset = value.substring(offset+8).toUpperCase();
        }
      }
View Full Code Here

    @Override
    public String select(String text) {
        try {
            HtmlCleaner htmlCleaner = new HtmlCleaner();
            TagNode tagNode = htmlCleaner.clean(text);
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            Object result;
            try {
                result = xPathExpression.evaluate(document, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {
View Full Code Here

    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        try {
            HtmlCleaner htmlCleaner = new HtmlCleaner();
            TagNode tagNode = htmlCleaner.clean(text);
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            Object result;
            try {
                result = xPathExpression.evaluate(document, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {
View Full Code Here

    @Test
    public void parserPerformanceTest() throws XPatherException {
        System.out.println(html.length());

        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode tagNode = htmlCleaner.clean(html);
        Document document = Jsoup.parse(html);

        long time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            htmlCleaner.clean(html);
        }
        System.out.println(System.currentTimeMillis()-time);

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            tagNode.evaluateXPath("//a");
        }
        System.out.println(System.currentTimeMillis()-time);

        System.out.println("=============");

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            Jsoup.parse(html);
        }
        System.out.println(System.currentTimeMillis()-time);

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            document.select("a");
        }
        System.out.println(System.currentTimeMillis()-time);

        System.out.println("=============");

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            htmlCleaner.clean(html);
        }
        System.out.println(System.currentTimeMillis()-time);

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            tagNode.evaluateXPath("//a");
        }
        System.out.println(System.currentTimeMillis()-time);

        System.out.println("=============");
View Full Code Here

  }

  public byte[] processHtml(Resource resource, Book book, String outputEncoding) throws IOException {
   
    // clean html
    TagNode node = htmlCleaner.clean(resource.getReader());

    // post-process cleaned html
    node.setAttribute("xmlns", Constants.NAMESPACE_XHTML);
    node.setDocType(createXHTMLDoctypeToken());
   
    // write result to output
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Writer writer = new OutputStreamWriter(out, outputEncoding);
    writer = new NoCloseWriter(writer);
View Full Code Here

TOP

Related Classes of org.htmlcleaner.TagNode

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.