Package org.vietspider.html.parser

Examples of org.vietspider.html.parser.HTMLParser2


    saveData(name);
  }

  public void saveData(String name) throws Exception {
    if(data == null || data.length < 1) return;
    HTMLDocument doc = new HTMLParser2().createDocument(data, "utf-8");
    if(this.addPaths == null) return;
    doc = htmlExtractor.extract(doc, this.addPaths);
    if(removePaths != null && removePaths.length > 0){
      htmlExtractor.remove(doc.getRoot(), removePaths);
    }
View Full Code Here


    if(!childrenThread[0].isComplete()) return ;
    if(childrenThread == null) return;
    if(childrenThread[0] == null) return;
    byte [] data = childrenThread[0].getData();
    if(data == null || data.length < 0) return;
    HTMLDocument document =  new HTMLParser2().createDocument(data, "utf-8");
    if(homePath != null) {
      document = htmlExtractor.extract(document, new NodePath[]{homePath});
    }
    linkUtil.createFullNormalLink(document.getRoot(), new URL(url));
    links  = linkUtil.scanSiteLink(document.getRoot());
View Full Code Here

    charset = ch;   
  }

  public void documentBrowserCompleted(String content, String address){
    try{
      document = new HTMLParser2().createDocument(content);
      removeIFrameSource(document.getRoot());
      //      try {
      //        URL home = new URL(address);
      //        Map<String,String> map = new HashMap<String,String>();
      //        map.put("iframe", "src");
View Full Code Here

*/
public class JavascriptExtractor {

  public static void main(String[] args) throws Exception {
    URL url = new URL("http://java.sun.com/");
    HTMLDocument document = new HTMLParser2().createDocument(url.openStream(), "utf-8");
   
    NodeIterator iterator =  document.getRoot().iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(node.isNode(Name.SCRIPT)) {
View Full Code Here

          }
        }
      }
      if(builder.length() < 1) continue;
//      System.out.println(builder.toString());
      replace(jsScripts.get(i), new HTMLParser2().createDocument(builder.toString()));
    }
  }
View Full Code Here

    builder.append("<h4>Địa chỉ gửi link liên kết đến bài viết này</h4>");
   
    List<String> jsScripts = new ArrayList<String>();
    jsScripts.add("show_postcontent");
   
    HTMLDocument document = new HTMLParser2().createDocument(builder.toString());
    JsHandler.updateDocument(document, jsScripts);
    System.out.println(document.getTextValue());
  }
View Full Code Here

    return post(referer, formName, url, chars, abort);
  }
 
  public int post(String referer,
      String formName, String url, char[] chars, boolean abort) throws Exception {
    List<NodeImpl>  tokens  = new HTMLParser2().createTokens(chars);
    if(tokens == null) {
      message = "Error: Can't parse tokens!";
      return ERROR;
    }
View Full Code Here

    char [] chars = CharsDecoder.decode(charset, data, 0, data.length);
    return post(referer, url, chars);
  }
 
  public int post(String referer, String url, char[] chars) throws Exception {
    List<NodeImpl>  tokens  = new HTMLParser2().createTokens(chars);
    if(tokens == null) {
      message = "Can't parse tokens!";
      return ERROR;
    }
View Full Code Here

  void setAfterLogin(List<NodeImpl> tokens) throws Exception {
    afters = build(tokens);
  }
 
  private String [] build(List<NodeImpl> tokens) throws Exception {
    HTMLDocument doc = new HTMLParser2().createDocument(tokens);
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor extractor  = new HTMLExtractor();
   
    NodePath nodePath  = pathParser.toPath("BODY");
    HTMLNode body = extractor.lookNode(doc.getRoot(), nodePath);
View Full Code Here

    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor htmlExtractor = new HTMLExtractor();
   
    byte  [] bytes = download(webClient, "http://news.google.com.vn/");
   
    HTMLDocument document = new HTMLParser2().createDocument(bytes, null);
   
    String [] paths = {
        "BODY[0].TABLE[2].TBODY[0].TR[0].TD[3].TABLE[1].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].DIV[0].TABLE[0].TBODY[0].TR[0].TD[0].DIV[*]"
    };
   
View Full Code Here

TOP

Related Classes of org.vietspider.html.parser.HTMLParser2

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.