// Package com.jeck.microblogging.utils
//
// Source Code of com.jeck.microblogging.utils.HtmlUtils

package com.jeck.microblogging.utils;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

import weibo4j.http.HttpClient;
import weibo4j.http.Response;
import weibo4j.model.WeiboException;

public class HtmlUtils {
  private String html;
  private URL url;
  private TagNode root;
  private HtmlCleaner cleaner;

  public HtmlUtils(String html) {
    this.html = html;

    // create an instance of HtmlCleaner
    cleaner = new HtmlCleaner();
    // take default cleaner properties
    CleanerProperties props = cleaner.getProperties();

    // customize cleaner's behaviour with property setters
    // props.setXXX(...);
    props.setOmitXmlDeclaration(true);
    props.setOmitHtmlEnvelope(true);

    // Clean HTML taken from simple string, file, URL, input stream,
    // input source or reader. Result is root node of created
    // tree-like structure. Single cleaner instance may be safely used
    // multiple times.
    root = cleaner.clean(html);
  }
 
  public HtmlUtils(URL url) throws IOException {
    this.url = url;

    // create an instance of HtmlCleaner
    cleaner = new HtmlCleaner();
    // take default cleaner properties
    CleanerProperties props = cleaner.getProperties();

    // customize cleaner's behaviour with property setters
    // props.setXXX(...);
    props.setOmitXmlDeclaration(true);
    props.setOmitHtmlEnvelope(true);

    // Clean HTML taken from simple string, file, URL, input stream,
    // input source or reader. Result is root node of created
    // tree-like structure. Single cleaner instance may be safely used
    // multiple times.
    root = cleaner.clean(url);
  }

  public List<String> readByXPath(String xPath) throws XPatherException {
    Object[] nodes = root.evaluateXPath(xPath);
    List<String> rsList = new ArrayList<String>();
    if (null != nodes) {
      for (Object object : nodes) {
        TagNode node = (TagNode) object;
        rsList.add(node.getText().toString());
      }
    }
    return rsList;
  }
 
  public List<String> readByAttrValue(String attrName,String value) throws XPatherException {
    Object[] nodes = root.getElementsByAttValue(attrName, value, true, true);
    List<String> rsList = new ArrayList<String>();
    if (null != nodes) {
      for (Object object : nodes) {
        TagNode node = (TagNode) object;
        rsList.add(node.getText().toString());
      }
    }
    return rsList;
  }

  public static void main(String[] arg) throws Exception {
//    HttpClient client = new HttpClient();
//    client.setToken("test");
//    Response response = client.get("http://www.baidu.com");
    List<String> rs = new HtmlUtils(new URL("http://www.haha.mx/new"))
        .readByAttrValue("class", "item");
    if (rs != null) {
      for (String string : rs) {
        System.out.println("===="+string);
      }
    }
  }
}
// TOP
//
// Related Classes of com.jeck.microblogging.utils.HtmlUtils
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.