Package org.htmlcleaner

Examples of org.htmlcleaner.TagNode
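All of the snippets on this page follow the same basic pattern: an HtmlCleaner instance parses (possibly malformed) HTML into a tree of TagNode objects, which can then be queried directly or handed to a DomSerializer for full JAXP XPath support. The following is a minimal, self-contained sketch of that workflow; the input markup and the queried element names are placeholders for illustration, not code taken from the projects below.

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

public class TagNodeExample {
  public static void main(String[] args) throws XPatherException {
    // Parse (possibly malformed) HTML into a TagNode tree
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setOmitComments(true);

    // Placeholder markup: note the unclosed <a> and missing </html>
    TagNode root = cleaner.clean("<html><body><a href='http://example.com'>link</body>");

    // Query the tree directly: find the first <a> element, searching recursively
    TagNode anchor = root.findElementByName("a", true);
    if (anchor != null) {
      System.out.println(anchor.getAttributeByName("href"));
    }

    // TagNode also supports a limited XPath dialect of its own
    Object[] anchors = root.evaluateXPath("//a");
    System.out.println(anchors.length + " anchor(s) found");
  }
}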


  public static final String DEFAULT_HTML_INPUT_ENCODING = "Windows-1251";
 
  public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException,  XPathExpressionException {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    TagNode node = htmlCleaner.clean(hhcFile);
    Document hhcDocument = new DomSerializer(props).createDOM(node);
    XPath xpath = XPathFactory.newInstance().newXPath();
    Node ulNode = (Node) xpath.evaluate("body/ul", hhcDocument
        .getDocumentElement(), XPathConstants.NODE);
    List<TOCReference> sections = processUlNode(ulNode, resources);
View Full Code Here


        for (int ii = 0; ii < timesToRun; ++ii) {
          if (ii > 0) { // (else the text was either set directly, or in the "chained metadata" initialization above)
            text = (String)currField[ii];           
          }//TESTED
         
          TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));
         
          // NewCode: only use HtmlCleaner for cleansing; use JAXP for the full XPath library
          Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
         
View Full Code Here

    else if (scriptLang.equalsIgnoreCase("xpath")) {
     
      try {
        createHtmlCleanerIfNeeded();

        TagNode node = cleaner.clean(new ByteArrayInputStream(field.getBytes()));

        Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
        XPath xpa = XPathFactory.newInstance().newXPath();       
       
        NodeList res = (NodeList)xpa.evaluate(script, doc, XPathConstants.NODESET);
View Full Code Here

       
        // Create a reader using the document's original character encoding, falling back to the default encoding
        Reader rawContentReader = new InputStreamReader(new ByteArrayInputStream(rawContent), FilterUtils.getNullSafe(metadata.get(Metadata.ORIGINAL_CHAR_ENCODING), defaultEncoding));
       
        // Use the cleaner to "clean" the HTML and return it as a TagNode object
        TagNode tagNode = cleaner.clean(rawContentReader);
        cleanedXmlHtml = domSerializer.createDOM(tagNode);
      } else if (content.getContentType().contains("/xml") || content.getContentType().contains("+xml")) {
       
        // Parse as xml - don't clean
        cleanedXmlHtml = documentBuilder.parse(new InputSource(new ByteArrayInputStream(rawContent)))
View Full Code Here

      if (statusCode != 200) {
        throw new RuntimeException("Failed to get page: " + statusCode);
      }
      String response = method.getResponseBodyAsString();
      HtmlCleaner html = new HtmlCleaner();
      TagNode content = html.clean(response).findElementByAttValue("id", "content", true, false);
      Object[] rows = content.evaluateXPath("table/tbody/tr[@id]");
      for (Object row: rows) {
        if (!(row instanceof TagNode)) {
          continue;
        }
        TagNode rowNode = (TagNode) row;
        try {
          String s;
          Script script = new Script();
          String id = rowNode.getAttributeByName("id").replace("scripts-", "");       
          TagNode nameNode = rowNode.getChildTags()[0].getChildTags()[0];
          String scriptUrl = nameNode.getAttributeByName("href");
          String description = getText(rowNode, "td[1]/p");
          s = getText(rowNode, "td[2]/a");
          int reviews = s == null ? 0 : Integer.parseInt(s.replaceAll("\\D", ""));
          s = getText(rowNode, "td[2]/span/span[@class='number']");
          int averageReview = s == null ? 0 : (int) (Float.parseFloat(s) * 1000);
          int posts = Integer.parseInt(rowNode.getChildTags()[2].getText().toString());
          int fans = Integer.parseInt(rowNode.getChildTags()[3].getText().toString());
          int installs = Integer.parseInt(rowNode.getChildTags()[4].getText().toString());
          TagNode updatedNode = rowNode.getChildTags()[5];
          String dateString = updatedNode.getChildTags()[0].getAttributeByName("title").replace("Z", "-0000");

          script.id = id;
          script.url = scriptUrl;
          script.updated = sdf.parse(dateString).getTime();         
          script.installs = installs;
View Full Code Here

 
  /* (non-Javadoc)
   * @see org.apache.wookie.util.html.IHtmlProcessor#injectScript(java.lang.String)
   */
  public void injectScript(String script) {
    TagNode js = new TagNode(SCRIPT_TAG);
    js.addAttribute(TYPE_ATTRIBUTE, TYPE_ATTRIBUTE_VALUE);
    js.addAttribute(SRC_ATTRIBUTE, script);
    headNode.addChild(js);
  }
View Full Code Here

  /* (non-Javadoc)
   * @see org.apache.wookie.util.html.IHtmlProcessor#injectStylesheet(java.lang.String)
   */
  public void injectStylesheet(String stylesheet) {
    TagNode link = new TagNode(LINK_TAG);
    link.addAttribute(TYPE_ATTRIBUTE, CSS_TYPE_ATTRIBUTE_VALUE);
    link.addAttribute(REL_ATTRIBUTE, CSS_REL_ATTRIBUTE_VALUE);
    link.addAttribute(HREF_ATTRIBUTE, stylesheet);
    headNode.addChild(link);
  }
View Full Code Here
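
The two injection helpers above operate on a headNode field that has already been located in a cleaned document, and on constants (SCRIPT_TAG, TYPE_ATTRIBUTE, and so on) defined elsewhere in the processor. Below is a minimal, self-contained sketch of that surrounding workflow; the literal tag and attribute values and the placeholder script URL are assumptions for illustration, not values taken from the Wookie source, and the serializer call uses the newer HtmlCleaner API, which may differ in older releases.

import java.io.IOException;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;

public class HeadInjectionSketch {
  public static void main(String[] args) throws IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();

    TagNode root = cleaner.clean("<html><head><title>t</title></head><body></body></html>");

    // Locate the <head> node that the injection methods operate on
    TagNode headNode = root.findElementByName("head", true);

    // Equivalent of injectScript(): build a <script> element and append it to <head>
    TagNode js = new TagNode("script");              // assumed value of SCRIPT_TAG
    js.addAttribute("type", "text/javascript");      // assumed TYPE_ATTRIBUTE / TYPE_ATTRIBUTE_VALUE
    js.addAttribute("src", "/shared/js/widget.js");  // placeholder script URL
    headNode.addChild(js);

    // Serialize the modified tree back to HTML
    String html = new SimpleHtmlSerializer(props).getAsString(root);
    System.out.println(html);
  }
}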

   
    //
    // Check if the page already has a META http-equiv=content-type tag,
    // if it doesn't create one and add it to the head node
    //
    TagNode meta = headNode.findElementByAttValue("http-equiv", "content-type", true, false);
    if (meta == null) {
      meta = new TagNode(META_TAG);
      meta.addAttribute("http-equiv", "Content-Type");
      headNode.getChildren().add(0, meta);
    }
    //
    // Force the UTF-8 charset name to lowercase
    //
    if (charset.equals("UTF-8")) charset = "utf-8";
   
    //
    // Override the charset and content-type values for the
    // META http-equiv=content-type tag
    //
    meta.addAttribute("content", type + ";charset=" + charset);
  }
View Full Code Here

      //
      // Check for charset overrides in the HTML start page
      //
      HtmlCleaner cleaner = new HtmlCleaner();
      TagNode httpEquivNode = cleaner.clean(get.getResponseBodyAsStream()).findElementByAttValue("http-equiv", "content-type", true, false);
      if (httpEquivNode != null && httpEquivNode.hasAttribute("content")){
        String value = httpEquivNode.getAttributeByName("content");
        int offset = value.indexOf("charset=");
        if (offset >= 0){ // indexOf returns -1 when no charset parameter is present
            charset = value.substring(offset+8).toUpperCase();
        }
      }
View Full Code Here

 
  private String parse(String content){
    StringWriter writer = new StringWriter();
    HtmlSerializer ser = new HtmlSerializer(properties);
    try {
      TagNode html = cleaner.clean(content);
      ser.writeXml(html, writer, "UTF-8");
      return writer.getBuffer().toString();
    } catch (IOException e) {
      return null;
    }
View Full Code Here


