Package org.vietspider.html

Examples of org.vietspider.html.HTMLDocument


   
    // Example: pull the thread title, the poster names and the post bodies out of a forum page.
    // NOTE: this is a fragment of a larger method; download, homepage and buildText are
    // defined outside this snippet.

    // Address of the forum thread page to process.
    String address = "http://forums.java.net/jive/thread.jspa?threadID=40523&tstart=0";
    // Fetch the page content as raw bytes.
    byte  [] bytes = download(homepage, address);
   
    // Build an HTMLDocument from the bytes; "utf-8" is passed as the second argument
    // (presumably the charset used for decoding -- confirm against HTMLParser2).
    HTMLParser2 parser2 = new HTMLParser2();
    HTMLDocument document  = parser2.createDocument(bytes, "utf-8");
    // Path expression (tag names with child indices) for the node holding the thread title.
    String titlePathValue  = "BODY[0].DIV[1].TABLE[0].TBODY[0].TR[0].TD[0].P[1]";
   
    // The parser turns path strings into NodePath objects; the extractor resolves them.
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor htmlExtractor = new HTMLExtractor();
   
    // Resolve the title path from the document root and convert the node with buildText.
    NodePath titlePath = pathParser.toPath(titlePathValue);
    HTMLNode titleNode = htmlExtractor.lookNode(document.getRoot(), titlePath);
    String titleThread =  buildText(titleNode);
   
    // Two path expressions for the post tables. The second one uses TABLE[*]
    // (presumably "any index" -- confirm against the NodePathParser syntax).
    String [] postPathValues = {
        "BODY[0].DIV[1].TABLE[1].TBODY[0].TR[0].TD[0].DIV[1].DIV[0].DIV[0].TABLE[0]",
        "BODY[0].DIV[1].TABLE[1].TBODY[0].TR[0].TD[0].TABLE[*].TBODY[0].TR[0].TD[1].DIV[0].DIV[0].DIV[0].TABLE[0]"
    };
   
    // Extract the nodes on those paths into a second document; the paths used below
    // are resolved against document2's root, not against the original page.
    NodePath [] postPaths = pathParser.toNodePath(postPathValues);
    HTMLDocument document2 = htmlExtractor.extract(document, postPaths);
   
    // Path to the link node that holds the poster's name inside each extracted table.
    String userPathValue = "TABLE[*].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].NOBR[0].A[0]";
    NodePath userPath = pathParser.toPath(userPathValue);
    List<HTMLNode> userNodes = htmlExtractor.matchNodes(document2.getRoot(), userPath);
   
    // Collect the buildText result of every matched user node.
    List<String> users = new ArrayList<String>();
    for(HTMLNode userNode : userNodes) {
      users.add(buildText(userNode));
    }
   
    // Path to the row that holds the post content inside each extracted table.
    String textPostPathValue = "TABLE[*].TBODY[0].TR[0].TD[1].TABLE[0].TBODY[0].TR[1]";
    NodePath textPostPath = pathParser.toPath(textPostPathValue);
    List<HTMLNode> textPostNodes = htmlExtractor.matchNodes(document2.getRoot(), textPostPath);

    // Collect the buildText result of every matched post node.
    List<String> posts = new ArrayList<String>();
    for(HTMLNode textPostNode : textPostNodes) {
      posts.add(buildText(textPostNode));
    }
View Full Code Here


   
    // Example: extract the rows of a results table from a page.
    // NOTE: this is a fragment of a larger method and the listing is cut off after the
    // header labels; download and homepage are defined outside this snippet.

    // Address of the page to process.
    String address = "http://www.hastc.org.vn/Ketqua_giaodich.asp?stocktype=2&menuid=103120";
    // Fetch the page content as raw bytes.
    byte  [] bytes = download(homepage, address);
   
    // Build an HTMLDocument from the bytes; "utf-8" is passed as the second argument
    // (presumably the charset used for decoding -- confirm against HTMLParser2).
    HTMLParser2 parser2 = new HTMLParser2();
    HTMLDocument document  = parser2.createDocument(bytes, "utf-8");
   
    // The path expression is assembled in three pieces only to keep the lines short.
    // The final segment TR[i>0] presumably selects rows whose index is greater than 0,
    // i.e. skips the first row -- confirm against the NodePathParser syntax.
    String path = "BODY[0].TABLE[0].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0]";
    path += ".TABLE[0].TBODY[0].TR[1].TD[1].TABLE[0].TBODY[0].TR[0].TD[0].TABLE[1].TBODY[0].TR[1]";
    path += ".TD[0].TABLE[1].TBODY[0].TR[i>0]";
   
    NodePathParser pathParser = new NodePathParser();
   
    // Convert the path string into a NodePath.
    NodePath nodePath = pathParser.toPath(path);
   
    // Replace document with the result of the extraction, so that from here on
    // document refers to the extracted nodes rather than to the whole page.
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    document = htmlExtractor.extract(document, new NodePath[]{nodePath});
   
    // Children of the extracted document's root.
    List<HTMLNode> children = document.getRoot().getChildren();
   
    // Header labels to print; the listing is truncated after the first two entries.
    List<String> headers = new ArrayList<String>();
    headers.add("STT");
    headers.add("MaCK");
View Full Code Here

  }

  public static void main(String[] args) {
    try{
      URL url = new URL("http://www.java.net");
      HTMLDocument document = new HTMLParser2().createDocument(url.openStream(), "utf-8");
      testGetLink(document.getRoot());
      System.out.println("\n\n\n\n*********************************************************************\n\n\n\n");
      testCreateFullLink(document.getRoot(), url);
      System.out.println("\n\n\n\n*********************************************************************\n\n\n\n");
      testCreateImageLink(document.getRoot(), url);
    }catch(Exception exp){
      exp.printStackTrace();
    }
  }
View Full Code Here

    }
  }

  public static void main(String[] args) throws Exception {
    URL url = new URL("http://vnexpress.net/Vietnam/Xa-hoi/2006/10/3B9EFB66/");
    HTMLDocument document = new HTMLParser2().createDocument(url.openStream(), null);
    StringBuilder builder = new StringBuilder();
    build(builder, document.getRoot());
    File file = new File("E:\\Temp\\a.txt");
    if(!file.exists()) file.createNewFile();
    FileOutputStream stream = new FileOutputStream(file);
    stream.write(builder.toString().getBytes("utf-8"));
    stream.close();
View Full Code Here

TOP

Related Classes of org.vietspider.html.HTMLDocument

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.