Package org.vietspider.html

Examples of org.vietspider.html.HTMLNode


      return;
    }
  }
 
  private static List<HTMLNode> getNodes(HTMLDocument document) {
    HTMLNode root = document.getRoot();
    List<HTMLNode> values = new ArrayList<HTMLNode>();
    if(root.isNode(Name.HTML))  {
      List<HTMLNode> children = root.getChildren();
      if(children == null) return values;
      for(int i = 0; i< children.size(); i++) {
        if(children.get(i).isNode(Name.HEAD)) continue;
        if(children.get(i).isNode(Name.BODY)) {
          List<HTMLNode> bodyChildren = children.get(i).getChildren();
View Full Code Here


  }
 
 
  private static void searchScriptNode(NodeIterator iterator, List<HTMLNode> jsScripts) {
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(Name.SCRIPT)) jsScripts.add(n);
    }
   
    /*List<HTMLNode> childen = node.getChildren();
    if (childen == null)  return ;
    for(int i = 0; i < childen.size(); i++) {
View Full Code Here

  private void searchDataNode(HTMLNode root) {*/
    NodePathParser pathParser = new NodePathParser();
    NodeIterator iterator = root.iterator();
  
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.DIV)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("id");
      if(attribute == null) continue;
      String value = attribute.getValue();
      if(value == null) continue;
     
      if("posts".equalsIgnoreCase(value)) {
        HTMLNode titleNode = searchTitleNode(root, node);
        if(titleNode != null) {
          titlePath = titleNode.getName()+"[0]";
          extractPaths.add(pathParser.toPath(titleNode).toString());
        }
       
        HTMLNode pageNode = searchPageNode(node);
        if(pageNode == null) pageNode = searchPageNode2(root);
        if(pageNode != null) {
          pagePath = "TABLE[0]";
          extractPaths.add(pathParser.toPath(pageNode).toString());
        }
       
        String threadPath = pathParser.toPath(node).toString();
       
        HTMLNode userNode = searchUserNode(node);
        if(userNode != null) {
          userPath = pathParser.toPath(userNode).toString();
         
          String path = userPath.substring(threadPath.length());
          int index = path.indexOf('[');
          if(index > -1) {
            path = path.substring(0, index+1) + "*" + path.substring(index+2, path.length());
          }
          if(titleNode != null && titleNode.isNode(Name.DIV)) {
            userPath = "DIV[1]" + path;
          } else {
            userPath = "DIV[0]" + path;
          }
          index = userPath.lastIndexOf("TD[");
          if(index > 0) {
            int end = userPath.indexOf(']', index);
            if(end > 0) {
              userPath = userPath.substring(0, index+3) + "i<2"+ userPath.substring(end);
            }
          }
        }
        if(userPath == null) return;
       
       
        HTMLNode postNode = searchContentNode(node, "post_message");
        if(postNode != null) {
          postPath = pathParser.toPath(postNode).toString();
         
          String path = postPath.substring(threadPath.length());
          int index = path.indexOf('[');
View Full Code Here

      }
    }
  }
 
  private HTMLNode searchPageNode(HTMLNode root) {
    HTMLNode parent = root.getParent();
    if(parent == null) return null;
    NodeIterator iterator = parent.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.CONTENT)) continue;
      String content  = node.getTextValue().toLowerCase();
      if(content.indexOf("page") < 0
          && content.indexOf("trang") < 0) continue;
      HTMLNode table = upParent(node, Name.TABLE);
      if(table == null || !isPageList(table)) continue;
      return table;
    }
    return null;
  }
View Full Code Here

  }
 
  private HTMLNode searchUserNode(HTMLNode root) {
    NodeIterator iterator = root.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.A)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("class");
      if(attribute == null) continue;
      String value = attribute.getValue();
      if(value == null) continue;
      value = value.toLowerCase();
View Full Code Here

  }
 
  private HTMLNode searchContentNode(HTMLNode root, String clazz) {
    NodeIterator iterator = root.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.DIV)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("id");
      if(attribute == null) continue;
      String value = attribute.getValue();
      if(value == null) continue;
      value = value.toLowerCase();
View Full Code Here

  }
 
  private boolean isPageList(HTMLNode node) {
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.A)) continue;
      Attributes attributes = n.getAttributes();
      Attribute attribute = attributes.get("href");
      if(attribute == null) continue;
      String value = attribute.getValue();
      if(value == null) continue;
      if(value.indexOf("page=") > -1) return true;
View Full Code Here

    return false;
  }
 
  private HTMLNode searchPageNode2(HTMLNode node) {
    NodeIterator iterator = node.iterator();
    HTMLNode table = null;
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.A)) continue;
      List<HTMLNode> children = n.getChildren();
      if(children == null
          || children.size() != 1
          || !children.get(0).isNode(Name.CONTENT)) continue;
      String text = children.get(0).getTextValue();
      try {
        Integer.parseInt(text.trim());
        table = upParent(n, Name.TABLE);
        break;
      } catch (Exception e) {
      }
    }
    if(table == null) return null;
    NodeHandler nodeHandler = new NodeHandler();
    iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.CONTENT)) continue;
      String text = n.getTextValue().toLowerCase().trim();
      if(text.startsWith("trang") || text.startsWith("page")) {
        if(nodeHandler.count(text) < 5) return table;
      }
    }
    return table;
View Full Code Here

    HTMLExtractor extractor  = new HTMLExtractor();
    NodePathParser pathParser = new NodePathParser();
    String title = "title";
    try {
      NodePath nodePath  = pathParser.toPath("HEAD.TITLE");
      HTMLNode titleNode = extractor.lookNode(root, nodePath);
      if(titleNode.hasChildren()) {
        title  = titleNode.getChild(0).getTextValue();
      }
    } catch (Exception e) {
      return null;
    }
   
    HTMLNode parent = node.getParent();
    if(parent == null) return null;
    NodeIterator iterator = parent.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.CONTENT)) continue;
      if(n.getParent().isNode(Name.A)) continue;
      String content = n.getTextValue();
      if(indexOf(title, content)) return upParent(n, Name.TD, Name.DIV, Name.STRONG);
    }
    return null;
  }
View Full Code Here

        String name = getAttribute(attrs, NAME_ATTR);
        if(formName.equalsIgnoreCase(name)) break;
      }
    }

    HTMLNode form = null;
    List<HTMLNode> inputs = new ArrayList<HTMLNode>();

    for(; i < tokens.size(); i++) {
      NodeImpl node = tokens.get(i);
      if(node.isNode(Name.FORM)) {
View Full Code Here

TOP

Related Classes of org.vietspider.html.HTMLNode

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.