Package org.htmlcleaner

Examples of org.htmlcleaner.TagNode


 
  public static Collection<String> findAllLinkHref(String html, String hostUrl) throws Exception{
    Collection<String> urls = new ArrayList<String>();
   
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode node = cleaner.clean(html);
    Object[] ns = node.evaluateXPath("//a[@href]");
    for (Object object : ns) {
      TagNode node2=(TagNode) object;
      String href = node2.getAttributeByName("href");
      if (href == null || href.trim().length() == 0)
        continue;
     
      if (!href.startsWith("https://") && !href.startsWith("http://")){
        StringBuilder sb = new StringBuilder("http://").append(new URL(hostUrl).getHost());
View Full Code Here


 
  private List<Map<String, Object>> parseHtml(Page page) throws Exception{
    HtmlCleaner cleaner = new HtmlCleaner();
    cleaner.getProperties().setTreatUnknownTagsAsContent(true);
    String html = page.getContent();
    TagNode rootNode = cleaner.clean(html);
    fel.getContext().set("$page_content", html);
        final List<Field> fields = target.getModel().getField();
    String isModelArray = target.getModel().getIsArray();
    String modelXpath = target.getModel().getXpath();
    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
    if ("1".equals(isModelArray) || "tre".equals(isModelArray)){
      Object[] nodeVals = rootNode.evaluateXPath(modelXpath);
          if (nodeVals != null && nodeVals.length > 0){
            for (int i = 0; i < nodeVals.length; i++) {
          list.add(parseHtmlMap(nodeVals[i], fields));
            }
          }
View Full Code Here

        String exp = parser.getExp();
        String regex = parser.getRegex();
        String skipRgxFail = parser.getSkipRgxFail();
        try {
          if (xpath != null && xpath.trim().length() > 0) {
            TagNode tag = (TagNode)item;
            Object[] nodeVals = tag.evaluateXPath(xpath);
            if (nodeVals == null || nodeVals.length == 0)
              continue;
           
            if (attribute != null && attribute.trim().length() > 0){
              for (Object nodeVal : nodeVals){
                TagNode node = (TagNode)nodeVal;
                String attrVal = node.getAttributeByName(attribute);
                values.add(attrVal);
              }
              //正则
              parseByRegex(regex, skipRgxFail, values);
              // EXP表达式
              parseByExp(exp, values);
            }else if (xpath.endsWith("/text()")){
              for (Object nodeVal : nodeVals){
                values.add(nodeVal.toString());
              }
             
              //正则
              parseByRegex(regex, skipRgxFail, values);
             
              // EXP表达式
              parseByExp(exp, values);
            }else {
              for (Object nodeVal : nodeVals){
                TagNode node = (TagNode)nodeVal;
                values.add(node);
              }
             
              // 此种方式获取到的Node节点大部分都不是字符串,因此先执行表达式后执行正则
              // EXP表达式
View Full Code Here

 
  public static Object evalXpath(String html, String xpath, String attribute){
    List<Object> result = new ArrayList<Object>();
    HtmlCleaner cleaner = new HtmlCleaner();
    try {
      TagNode tagNode = cleaner.clean(html);
      Object[] nodeVals = tagNode.evaluateXPath(xpath);
      for (Object tag : nodeVals){
        TagNode _tag = (TagNode)tag;
        Object val = null;
        if (attribute != null)
          val = _tag.getAttributeByName(attribute);
        else if (xpath.endsWith("/text()")){
          result.add(tag.toString());
        }else
          val = tag;
       
View Full Code Here

      for( int i=0 ; i<lElements.size() ; i++ )
      {
        try
        {
          HtmlCleaner cleaner = new HtmlCleaner();
          TagNode rootNode = cleaner.clean(new URL(url));
          TagNode tagElements[] = rootNode.getElementsByName( lElements.get(i).getTag(), true );
          for( int j=0 ; j<tagElements.length ; j++ )
            if( lElements.get(i).getClas().equalsIgnoreCase("*") || tagElements[j].getAttributeByName("class").equals(lElements.get(i).getClas()) )
              lElements.get(i).add( tagElements[j].getText().toString() );
        }
        catch( Exception e )
View Full Code Here

    return result;
  }

  public void buildEditLinkUrl(int section) {
    if (fParserInput.getAllowSectionEdit()) {
      TagNode divTagNode = new TagNode("div");
      divTagNode.addAttribute("style", "font-size:90%;float:right;margin-left:5px;", false);
      divTagNode.addChild(new ContentToken("["));
      append(divTagNode);

      String url = "";
      try {
        url = LinkUtil.buildEditLinkUrl(fParserInput.getContext(), fParserInput.getVirtualWiki(), fParserInput.getTopicName(),
            null, section);
      } catch (Exception e) {
        logger.severe("Failure while building link for topic " + fParserInput.getVirtualWiki() + " / "
            + fParserInput.getTopicName(), e);
      }
      TagNode aTagNode = new TagNode("a");
      aTagNode.addAttribute("href", url, false);
      aTagNode.addChild(new ContentToken(Utilities.formatMessage("common.sectionedit", fParserInput.getLocale())));
      divTagNode.addChild(aTagNode);
      divTagNode.addChild(new ContentToken("]"));
    }
  }
View Full Code Here

      HtmlCleaner cleaner = new HtmlCleaner();
     
      CleanerProperties props = cleaner.getProperties();
      props.setUseEmptyElementTags(false);
     
      TagNode node = cleaner.clean(sb.toString());
      Document myJDom = new JDomSerializer(props, true).createJDom(node);
      XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
      sbResult.append(outputter.outputString(myJDom));
    }
    catch (IOException e) {logger.error(e);}
View Full Code Here

  public List<String> readByXPath(String xPath) throws XPatherException {
    Object[] nodes = root.evaluateXPath(xPath);
    List<String> rsList = new ArrayList<String>();
    if (null != nodes) {
      for (Object object : nodes) {
        TagNode node = (TagNode) object;
        rsList.add(node.getText().toString());
      }
    }
    return rsList;
  }
View Full Code Here

  public List<String> readByAttrValue(String attrName,String value) throws XPatherException {
    Object[] nodes = root.getElementsByAttValue(attrName, value, true, true);
    List<String> rsList = new ArrayList<String>();
    if (null != nodes) {
      for (Object object : nodes) {
        TagNode node = (TagNode) object;
        rsList.add(node.getText().toString());
      }
    }
    return rsList;
  }
View Full Code Here

    // input source or reader. Result is root node of created
    // tree-like structure. Single cleaner instance may be safely used
    // multiple times.
//    TagNode node = cleaner.clean( new File("tests/accept.html"));

    TagNode node = cleaner.clean( new File("tests/rallypoint.htm"));


    Object[] myNodes;
       


        myNodes = node.evaluateXPath( "//div[@id='content']//h1/text( )");

    String[] data = Util.patternExtract( "([^(]+)"+Translator.get(Translator.LEVEL)+" (\\d+)", myNodes[0].toString(), 2);
        String currentLocation = Translator.translate( data[0].trim());
        int    level    = Integer.parseInt( data[1].trim());
    System.out.printf( "currentLocation=%s, level=%d\n\n", currentLocation, level);

    int  numNode = 0;
        Object[] domTroops = node.evaluateXPath( "//div[@id='content']//table");
        for( Object t : domTroops) {
          if( t instanceof TagNode) {
            TagNode domTroop = (TagNode) t;
           
                Object[] domTroopTypes = domTroop.evaluateXPath( ".//tbody[@class='units']/tr[position()=1]/td/img/@title");
                Object[] domTroopQuantities = node.evaluateXPath( ".//tbody[@class='units']/tr[position()=2]/td/text()");
                Object[] domLocations = node.evaluateXPath( "//div[@id='content']//table/thead/tr/td[2]/a/@href");

              String codLocation = "<local>";
              if( numNode > 0) { // Other villages
View Full Code Here

TOP

Related Classes of org.htmlcleaner.TagNode

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.