Package org.htmlcleaner

Examples of org.htmlcleaner.TagNode
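All of the snippets on this page follow the same basic pattern: an HtmlCleaner instance parses (possibly malformed) HTML into a tree of TagNode objects, which can then be queried directly or handed to a DomSerializer for full JAXP XPath support. The following is a minimal, self-contained sketch of that workflow; the input markup and the queried element names are placeholders for illustration, not code taken from the projects below.

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

public class TagNodeExample {
  public static void main(String[] args) throws XPatherException {
    // Parse (possibly malformed) HTML into a TagNode tree
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setOmitComments(true);

    // Placeholder markup: note the unclosed <a> and missing </html>
    TagNode root = cleaner.clean("<html><body><a href='http://example.com'>link</body>");

    // Query the tree directly: find the first <a> element, searching recursively
    TagNode anchor = root.findElementByName("a", true);
    if (anchor != null) {
      System.out.println(anchor.getAttributeByName("href"));
    }

    // TagNode also supports a limited XPath dialect of its own
    Object[] anchors = root.evaluateXPath("//a");
    System.out.println(anchors.length + " anchor(s) found");
  }
}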


  public static final String DEFAULT_HTML_INPUT_ENCODING = "Windows-1251";
 
  public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException,  XPathExpressionException {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    TagNode node = htmlCleaner.clean(hhcFile);
    Document hhcDocument = new DomSerializer(props).createDOM(node);
    XPath xpath = XPathFactory.newInstance().newXPath();
    Node ulNode = (Node) xpath.evaluate("body/ul", hhcDocument
        .getDocumentElement(), XPathConstants.NODE);
    List<TOCReference> sections = processUlNode(ulNode, resources);
View Full Code Here


        for (int ii = 0; ii < timesToRun; ++ii) {
          if (ii > 0) { // (else the text was either set directly, or in the "chained metadata" initialization above)
            text = (String)currField[ii];           
          }//TESTED
         
          TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));
         
          // NewCode: only use HtmlCleaner for cleansing; use JAXP for the full XPath library
          Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
         
View Full Code Here

    else if (scriptLang.equalsIgnoreCase("xpath")) {
     
      try {
        createHtmlCleanerIfNeeded();

        TagNode node = cleaner.clean(new ByteArrayInputStream(field.getBytes()));

        Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
        XPath xpa = XPathFactory.newInstance().newXPath();       
       
        NodeList res = (NodeList)xpa.evaluate(script, doc, XPathConstants.NODESET);
View Full Code Here

       
        // Create a reader using the document's original character encoding, falling back to the default encoding
        Reader rawContentReader = new InputStreamReader(new ByteArrayInputStream(rawContent), FilterUtils.getNullSafe(metadata.get(Metadata.ORIGINAL_CHAR_ENCODING), defaultEncoding));
       
        // Use the cleaner to "clean" the HTML and return it as a TagNode object
        TagNode tagNode = cleaner.clean(rawContentReader);
        cleanedXmlHtml = domSerializer.createDOM(tagNode);
      } else if (content.getContentType().contains("/xml") || content.getContentType().contains("+xml")) {
       
        // Parse as xml - don't clean
        cleanedXmlHtml = documentBuilder.parse(new InputSource(new ByteArrayInputStream(rawContent)))
View Full Code Here

      if (statusCode != 200) {
        throw new RuntimeException("Failed to get page: " + statusCode);
      }
      String response = method.getResponseBodyAsString();
      HtmlCleaner html = new HtmlCleaner();
      TagNode content = html.clean(response).findElementByAttValue("id", "content", true, false);
      Object[] rows = content.evaluateXPath("table/tbody/tr[@id]");
      for (Object row: rows) {
        if (!(row instanceof TagNode)) {
          continue;
        }
        TagNode rowNode = (TagNode) row;
        try {
          String s;
          Script script = new Script();
          String id = rowNode.getAttributeByName("id").replace("scripts-", "");       
          TagNode nameNode = rowNode.getChildTags()[0].getChildTags()[0];
          String scriptUrl = nameNode.getAttributeByName("href");
          String description = getText(rowNode, "td[1]/p");
          s = getText(rowNode, "td[2]/a");
          int reviews = s == null ? 0 : Integer.parseInt(s.replaceAll("\\D", ""));
          s = getText(rowNode, "td[2]/span/span[@class='number']");
          int averageReview = s == null ? 0 : (int) (Float.parseFloat(s) * 1000);
          int posts = Integer.parseInt(rowNode.getChildTags()[2].getText().toString());
          int fans = Integer.parseInt(rowNode.getChildTags()[3].getText().toString());
          int installs = Integer.parseInt(rowNode.getChildTags()[4].getText().toString());
          TagNode updatedNode = rowNode.getChildTags()[5];
          String dateString = updatedNode.getChildTags()[0].getAttributeByName("title").replace("Z", "-0000");

          script.id = id;
          script.url = scriptUrl;
          script.updated = sdf.parse(dateString).getTime();         
          script.installs = installs;
View Full Code Here

 
  /* (non-Javadoc)
   * @see org.apache.wookie.util.html.IHtmlProcessor#injectScript(java.lang.String)
   */
  public void injectScript(String script) {
    TagNode js = new TagNode(SCRIPT_TAG);
    js.addAttribute(TYPE_ATTRIBUTE, TYPE_ATTRIBUTE_VALUE);
    js.addAttribute(SRC_ATTRIBUTE, script);
    headNode.addChild(js);
  }
View Full Code Here

  /* (non-Javadoc)
   * @see org.apache.wookie.util.html.IHtmlProcessor#injectStylesheet(java.lang.String)
   */
  public void injectStylesheet(String stylesheet) {
    TagNode link = new TagNode(LINK_TAG);
    link.addAttribute(TYPE_ATTRIBUTE, CSS_TYPE_ATTRIBUTE_VALUE);
    link.addAttribute(REL_ATTRIBUTE, CSS_REL_ATTRIBUTE_VALUE);
    link.addAttribute(HREF_ATTRIBUTE, stylesheet);
    headNode.addChild(link);
  }
View Full Code Here
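
The two injection helpers above operate on a headNode field that has already been located in a cleaned document, and on constants (SCRIPT_TAG, TYPE_ATTRIBUTE, and so on) defined elsewhere in the processor. Below is a minimal, self-contained sketch of that surrounding workflow; the literal tag and attribute values and the placeholder script URL are assumptions for illustration, not values taken from the Wookie source, and the serializer call uses the newer HtmlCleaner API, which may differ in older releases.

import java.io.IOException;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;

public class HeadInjectionSketch {
  public static void main(String[] args) throws IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();

    TagNode root = cleaner.clean("<html><head><title>t</title></head><body></body></html>");

    // Locate the <head> node that the injection methods operate on
    TagNode headNode = root.findElementByName("head", true);

    // Equivalent of injectScript(): build a <script> element and append it to <head>
    TagNode js = new TagNode("script");              // assumed value of SCRIPT_TAG
    js.addAttribute("type", "text/javascript");      // assumed TYPE_ATTRIBUTE / TYPE_ATTRIBUTE_VALUE
    js.addAttribute("src", "/shared/js/widget.js");  // placeholder script URL
    headNode.addChild(js);

    // Serialize the modified tree back to HTML
    String html = new SimpleHtmlSerializer(props).getAsString(root);
    System.out.println(html);
  }
}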

   
    //
    // Check if the page already has a META http-equiv=content-type tag,
    // if it doesn't create one and add it to the head node
    //
    TagNode meta = headNode.findElementByAttValue("http-equiv", "content-type", true, false);
    if (meta == null) {
      meta = new TagNode(META_TAG);
      meta.addAttribute("http-equiv", "Content-Type");
      headNode.getChildren().add(0, meta);
    }
    //
    // Force the UTF-8 charset name to lowercase
    //
    if (charset.equals("UTF-8")) charset = "utf-8";
   
    //
    // Override the charset and content-type values for the
    // META http-equiv=content-type tag
    //
    meta.addAttribute("content", type + ";charset=" + charset);
  }
View Full Code Here

      //
      // Check for charset overrides in the HTML start page
      //
      HtmlCleaner cleaner = new HtmlCleaner();
      TagNode httpEquivNode = cleaner.clean(get.getResponseBodyAsStream()).findElementByAttValue("http-equiv", "content-type", true, false);
      if (httpEquivNode != null && httpEquivNode.hasAttribute("content")){
        String value = httpEquivNode.getAttributeByName("content");
        int offset = value.indexOf("charset=");
        if (offset >= 0){ // indexOf returns -1 when no charset parameter is present
            charset = value.substring(offset+8).toUpperCase();
        }
      }
View Full Code Here

 
  private String parse(String content){
    StringWriter writer = new StringWriter();
    HtmlSerializer ser = new HtmlSerializer(properties);
    try {
      TagNode html = cleaner.clean(content);
      ser.writeXml(html, writer, "UTF-8");
      return writer.getBuffer().toString();
    } catch (IOException e) {
      return null;
    }
View Full Code Here


