Package org.vietspider.html

Examples of org.vietspider.html.HTMLNode


      }
    }
  }

  public HTMLDocument searchDocument(HTMLDocument document, String name) {
    HTMLNode root = document.getRoot();
    HTMLNode newRoot = searchDocument(root, name);
    if (root == newRoot) return document;
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor extractor = new HTMLExtractor();
    try {
      HTMLNode body = extractor.lookNode(root, pathParser.toPath("BODY"));
      body.clearChildren();
      body.addChild(newRoot);
    } catch (Exception e) {
      LogService.getInstance().setThrowable(e);
    }
    return document;
  }
View Full Code Here


    }
    return root;
  }

  private HTMLNode processNextNode(HTMLNode root, List<HTMLNode> anchors, int i) {
    HTMLNode node = anchors.get(i);
    HTMLNode nextNode = anchors.get(i + 1);
    String path = getIndexPath(node);
    String nextPath = getIndexPath(nextNode);
    String commonPath = getCommonIndexPath(path, nextPath);
    try {
      HTMLNode commonNode = getNodeByIndex(root, commonPath);
      path = path.substring(commonPath.length());
      path = path.substring(1, path.indexOf('.', 1));
      int idx = Integer.parseInt(path);

      nextPath = nextPath.substring(commonPath.length());
      nextPath = nextPath.substring(1, nextPath.indexOf('.', 1));
      int nextIdx = Integer.parseInt(nextPath);

      HTMLNode newCommonNode =
        new NodeImpl(commonNode.getValue(), commonNode.getName(), TypeToken.TAG);
      List<HTMLNode> children = commonNode.getChildren();
      for (i = idx; i < nextIdx; i++) {
        newCommonNode.addChild(children.get(i));
      }
      return newCommonNode;
    } catch (Exception e) {
      // LogService.getInstance().setMessage(e.toString());
      return root;
View Full Code Here

  }

  private HTMLNode processPrevNode(HTMLNode root, List<HTMLNode> anchors, int i) {
    if (i < 0 || i >= anchors.size())
      return root;
    HTMLNode node = anchors.get(i);
    HTMLNode prevNode = anchors.get(i - 1);
    String path = getIndexPath(node);
    String prevPath = getIndexPath(prevNode);
    String commonPath = getCommonIndexPath(path, prevPath);
    try {
      HTMLNode commonNode = getNodeByIndex(root, commonPath);
      path = path.substring(commonPath.length());
      int idx = path.indexOf('.', 1);
      // path = path.substring(0, idx);
      if (idx > 0) {
        path = path.substring(1, idx);
      } else {
        if (path.charAt(0) == '.')
          path = path.substring(1);
      }

      // if(idx > 1) path = path.substring(2, idx);
      // if(path.charAt(0) == '.') path = path.substring(1);
      // if(path.trim().isEmpty()) return root;
      idx = Integer.parseInt(path);

      HTMLNode newCommonNode = new NodeImpl(commonNode.getValue(), commonNode.getName(),
          TypeToken.TAG);
      List<HTMLNode> children = commonNode.getChildren();
      for (i = idx; i < children.size(); i++) {
        newCommonNode.addChild(children.get(i));
        // children.get(i).setParent(newCommonNode);
      }
      return newCommonNode;
    } catch (Exception e) {
      StringBuilder builder = new StringBuilder();
View Full Code Here

 
  public void handle(HTMLNode node) {
    if(node == null) return;
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
     
      handleNode(n);
     
      if(!n.isTag()) continue;
      Attribute attr = getAttribute(n);
      if(attr == null) continue;
      String attrValue = attr.getValue();
      if(attrValue == null) continue;
      if (verifier == null || verifier.verify(attrValue)) list.add(attrValue);
View Full Code Here

* Oct 12, 2007 
*/
public abstract class HTMLNodePath {
 
  protected String getIndexPath(HTMLNode element){
    HTMLNode parent = element.getParent();
    HTMLNode child = element;
    StringBuilder path = new StringBuilder();
    while(parent != null){
      if(path.length() > 0) path.insert(0, '.');     
      path.insert(0, parent.getChildren().indexOf(child));
      child = parent;
View Full Code Here

  public HTMLNode getNodeByIndex(HTMLNode node, String indexPath) throws Exception {  
    if(indexPath == null || indexPath.trim().length() < 1) {
      throw new NullPointerException("path is empty or null");
    }
    String [] split = indexPath.split("\\.");
    HTMLNode ele = node; 
    for(String element : split) {
      ele = node.getChildren().get(Integer.parseInt(element));
      if(ele.getChildren() == null) break;
      node = ele;    
    }
    return ele;
 
View Full Code Here

    handler = new IdentifierAttributeHandler(list, jsVerifier, "a", "href");
    handler.handle(root);
   
    NodeIterator iterator = root.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.SCRIPT)) continue;
      if(n.getChildren().size() < 1) continue;
      list.add(n.getChild(0).getTextValue());
    }
    return list;
  }
View Full Code Here

  public synchronized void createFullLink(HTMLNode node,
      Map<String, String> map, URL home, ValueVerifier verifier) {
    if(node == null) return;
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isTag()) createFullSingleLink(n, map, home, verifier);
    }
  }
View Full Code Here

 
  public void createFullLink(HTMLNode node,
      String nodeName, String attrName, URL home, ValueVerifier verifier) {
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isTag()) createFullSingleLink(n, nodeName, attrName, home, verifier);
    }
  }
View Full Code Here

*/
public class HTMLParentUtils {
 
  public HTMLNode getUpParent(List<HTMLNode> nodes) {
    if(nodes.size() < 1) return null;
    HTMLNode parent  = nodes.get(0);
    while(parent != null) {
      if(isChild(parent, nodes, 1)) {
        return parent;
      }
      parent = parent.getParent();
    }
    return null;
  }
View Full Code Here

TOP

Related Classes of org.vietspider.html.HTMLNode

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.