Examples of org.htmlcleaner.TagNode

org.htmlcleaner.TagNode

XML tag node - the basic node of the cleaned HTML tree. It also represents a start tag token after the HTML parsing phase and before the cleaning phase. After the cleaning process, the resulting tree structure contains tag nodes (TagNode class), content (text nodes - ContentNode), comments (CommentNode) and, optionally, a doctype node (DoctypeToken).

  
  public static Collection<String> findAllLinkHref(String html, String hostUrl) throws Exception{
    Collection<String> urls = new ArrayList<String>();
    
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode node = cleaner.clean(html);
    Object[] ns = node.evaluateXPath("//a[@href]");
    for (Object object : ns) {
      TagNode node2=(TagNode) object;
      String href = node2.getAttributeByName("href");
      if (href == null || href.trim().length() == 0)
        continue;
      
      if (!href.startsWith("https://") && !href.startsWith("http://")){
        StringBuilder sb = new StringBuilder("http://").append(new URL(hostUrl).getHost());

View Full Code Here

  
  private List<Map<String, Object>> parseHtml(Page page) throws Exception{
    HtmlCleaner cleaner = new HtmlCleaner();
    cleaner.getProperties().setTreatUnknownTagsAsContent(true);
    String html = page.getContent();
    TagNode rootNode = cleaner.clean(html);
    fel.getContext().set("$page_content", html);
        final List<Field> fields = target.getModel().getField();
    String isModelArray = target.getModel().getIsArray();
    String modelXpath = target.getModel().getXpath();
    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
    if ("1".equals(isModelArray) || "tre".equals(isModelArray)){
      Object[] nodeVals = rootNode.evaluateXPath(modelXpath);
          if (nodeVals != null && nodeVals.length > 0){
            for (int i = 0; i < nodeVals.length; i++) {
          list.add(parseHtmlMap(nodeVals[i], fields));
            }
          }

View Full Code Here

        String exp = parser.getExp();
        String regex = parser.getRegex();
        String skipRgxFail = parser.getSkipRgxFail();
        try {
          if (xpath != null && xpath.trim().length() > 0) {
            TagNode tag = (TagNode)item;
            Object[] nodeVals = tag.evaluateXPath(xpath);
            if (nodeVals == null || nodeVals.length == 0)
              continue;
            
            if (attribute != null && attribute.trim().length() > 0){
              for (Object nodeVal : nodeVals){
                TagNode node = (TagNode)nodeVal;
                String attrVal = node.getAttributeByName(attribute);
                values.add(attrVal);
              }
              //正则
              parseByRegex(regex, skipRgxFail, values);
              // EXP表达式
              parseByExp(exp, values);
            }else if (xpath.endsWith("/text()")){
              for (Object nodeVal : nodeVals){
                values.add(nodeVal.toString());
              }
              
              //正则
              parseByRegex(regex, skipRgxFail, values);
              
              // EXP表达式
              parseByExp(exp, values);
            }else {
              for (Object nodeVal : nodeVals){
                TagNode node = (TagNode)nodeVal;
                values.add(node);
              }
              
              // 此种方式获取到的Node节点大部分都不是字符串，因此先执行表达式后执行正则
              // EXP表达式

View Full Code Here

  
  public static Object evalXpath(String html, String xpath, String attribute){
    List<Object> result = new ArrayList<Object>();
    HtmlCleaner cleaner = new HtmlCleaner();
    try {
      TagNode tagNode = cleaner.clean(html);
      Object[] nodeVals = tagNode.evaluateXPath(xpath);
      for (Object tag : nodeVals){
        TagNode _tag = (TagNode)tag;
        Object val = null;
        if (attribute != null)
          val = _tag.getAttributeByName(attribute);
        else if (xpath.endsWith("/text()")){
          result.add(tag.toString());
        }else 
          val = tag;

View Full Code Here

      for( int i=0 ; i<lElements.size() ; i++ )
      {
        try
        {
          HtmlCleaner cleaner = new HtmlCleaner();
          TagNode rootNode = cleaner.clean(new URL(url));
          TagNode tagElements[] = rootNode.getElementsByName( lElements.get(i).getTag(), true );
          for( int j=0 ; j<tagElements.length ; j++ )
            if( lElements.get(i).getClas().equalsIgnoreCase("*") || tagElements[j].getAttributeByName("class").equals(lElements.get(i).getClas()) )
              lElements.get(i).add( tagElements[j].getText().toString() );
        }
        catch( Exception e )

View Full Code Here

    return result;
  }


  public void buildEditLinkUrl(int section) {
    if (fParserInput.getAllowSectionEdit()) {
      TagNode divTagNode = new TagNode("div");
      divTagNode.addAttribute("style", "font-size:90%;float:right;margin-left:5px;", false);
      divTagNode.addChild(new ContentToken("["));
      append(divTagNode);


      String url = "";
      try {
        url = LinkUtil.buildEditLinkUrl(fParserInput.getContext(), fParserInput.getVirtualWiki(), fParserInput.getTopicName(),
            null, section);
      } catch (Exception e) {
        logger.severe("Failure while building link for topic " + fParserInput.getVirtualWiki() + " / "
            + fParserInput.getTopicName(), e);
      }
      TagNode aTagNode = new TagNode("a");
      aTagNode.addAttribute("href", url, false);
      aTagNode.addChild(new ContentToken(Utilities.formatMessage("common.sectionedit", fParserInput.getLocale())));
      divTagNode.addChild(aTagNode);
      divTagNode.addChild(new ContentToken("]"));
    }
  }

View Full Code Here

      HtmlCleaner cleaner = new HtmlCleaner();
      
      CleanerProperties props = cleaner.getProperties();
      props.setUseEmptyElementTags(false);
      
      TagNode node = cleaner.clean(sb.toString());
      Document myJDom = new JDomSerializer(props, true).createJDom(node);
      XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
      sbResult.append(outputter.outputString(myJDom));
    }
    catch (IOException e) {logger.error(e);}

View Full Code Here

  public List<String> readByXPath(String xPath) throws XPatherException {
    Object[] nodes = root.evaluateXPath(xPath);
    List<String> rsList = new ArrayList<String>();
    if (null != nodes) {
      for (Object object : nodes) {
        TagNode node = (TagNode) object;
        rsList.add(node.getText().toString());
      }
    }
    return rsList;
  }

View Full Code Here

  public List<String> readByAttrValue(String attrName,String value) throws XPatherException {
    Object[] nodes = root.getElementsByAttValue(attrName, value, true, true);
    List<String> rsList = new ArrayList<String>();
    if (null != nodes) {
      for (Object object : nodes) {
        TagNode node = (TagNode) object;
        rsList.add(node.getText().toString());
      }
    }
    return rsList;
  }

View Full Code Here

    // input source or reader. Result is root node of created 
    // tree-like structure. Single cleaner instance may be safely used
    // multiple times.
//    TagNode node = cleaner.clean( new File("tests/accept.html"));


    TagNode node = cleaner.clean( new File("tests/rallypoint.htm"));




    Object[] myNodes;
        




        myNodes = node.evaluateXPath( "//div[@id='content']//h1/text( )");


    String[] data = Util.patternExtract( "([^(]+)"+Translator.get(Translator.LEVEL)+" (\\d+)", myNodes[0].toString(), 2);
        String currentLocation = Translator.translate( data[0].trim());
        int    level    = Integer.parseInt( data[1].trim());
    System.out.printf( "currentLocation=%s, level=%d\n\n", currentLocation, level);


    int  numNode = 0;
        Object[] domTroops = node.evaluateXPath( "//div[@id='content']//table");
        for( Object t : domTroops) {
          if( t instanceof TagNode) {
            TagNode domTroop = (TagNode) t;
            
                Object[] domTroopTypes = domTroop.evaluateXPath( ".//tbody[@class='units']/tr[position()=1]/td/img/@title");
                Object[] domTroopQuantities = node.evaluateXPath( ".//tbody[@class='units']/tr[position()=2]/td/text()");
                Object[] domLocations = node.evaluateXPath( "//div[@id='content']//table/thead/tr/td[2]/a/@href");


              String codLocation = "<local>";
              if( numNode > 0) { // Other villages

View Full Code Here

0 1 2 3 4 5 6

TOP

Related Classes of org.htmlcleaner.TagNode

at.newmedialab.ldpath.model.functions.CleanHtmlFunction

ch.entwine.weblounge.preview.xhtmlrenderer.XhtmlRendererPagePreviewGenerator

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilter

com.cubusmail.mail.text.MessageTextUtil

com.cubusmail.mail.text.test.HtmlParserTest

com.cubusmail.server.mail.text.MessageTextUtil

com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester

com.jeck.microblogging.utils.HtmlUtils

com.netfever.site.dynovisz.tools.utils.XmlUtils

com.skrul.greasefire.DownloadScripts

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.