Package org.vietspider.html.util

Source Code of org.vietspider.html.util.HTMLAnchorUtil

/***************************************************************************
* Copyright 2001-2007 The VietSpider         All rights reserved.       *
**************************************************************************/
package org.vietspider.html.util;

import java.util.ArrayList;
import java.util.List;

import org.vietspider.common.io.LogService;
import org.vietspider.html.HTMLDocument;
import org.vietspider.html.HTMLNode;
import org.vietspider.html.Name;
import org.vietspider.html.NodeIterator;
import org.vietspider.html.parser.NodeImpl;
import org.vietspider.html.path2.HTMLExtractor;
import org.vietspider.html.path2.NodePathParser;
import org.vietspider.token.TypeToken;
import org.vietspider.token.attribute.Attribute;
import org.vietspider.token.attribute.Attributes;

/**
* Author : Nhu Dinh Thuan nhudinhthuan@yahoo.com Oct 12, 2007
*/
public class HTMLAnchorUtil extends HTMLNodePath {

  public final void searchAnchors(HTMLNode node, List<HTMLNode> anchors) {  
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if (n.isNode(Name.A)) {
        Attributes attrs = n.getAttributes();
        Attribute attr = attrs.get("name");
        if (attr != null) anchors.add(n);
      }
    }
  }

  public HTMLDocument searchDocument(HTMLDocument document, String name) {
    HTMLNode root = document.getRoot();
    HTMLNode newRoot = searchDocument(root, name);
    if (root == newRoot) return document;
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor extractor = new HTMLExtractor();
    try {
      HTMLNode body = extractor.lookNode(root, pathParser.toPath("BODY"));
      body.clearChildren();
      body.addChild(newRoot);
    } catch (Exception e) {
      LogService.getInstance().setThrowable(e);
    }
    return document;
  }

  public HTMLNode searchDocument(HTMLNode root, String name) {
    List<HTMLNode> anchors = new ArrayList<HTMLNode>();
    searchAnchors(root, anchors);

    int i = 0;
    for (; i < anchors.size(); i++) {
      Attributes attrs = anchors.get(i).getAttributes();
      Attribute attr = attrs.get("name");
      if (attr.getValue().equals(name)) break;
    }

    if (i < anchors.size() - 1) {
      return processNextNode(root, anchors, i);
    } else if (i > 0) {
      return processPrevNode(root, anchors, i);
    }
    return root;
  }

  private HTMLNode processNextNode(HTMLNode root, List<HTMLNode> anchors, int i) {
    HTMLNode node = anchors.get(i);
    HTMLNode nextNode = anchors.get(i + 1);
    String path = getIndexPath(node);
    String nextPath = getIndexPath(nextNode);
    String commonPath = getCommonIndexPath(path, nextPath);
    try {
      HTMLNode commonNode = getNodeByIndex(root, commonPath);
      path = path.substring(commonPath.length());
      path = path.substring(1, path.indexOf('.', 1));
      int idx = Integer.parseInt(path);

      nextPath = nextPath.substring(commonPath.length());
      nextPath = nextPath.substring(1, nextPath.indexOf('.', 1));
      int nextIdx = Integer.parseInt(nextPath);

      HTMLNode newCommonNode =
        new NodeImpl(commonNode.getValue(), commonNode.getName(), TypeToken.TAG);
      List<HTMLNode> children = commonNode.getChildren();
      for (i = idx; i < nextIdx; i++) {
        newCommonNode.addChild(children.get(i));
      }
      return newCommonNode;
    } catch (Exception e) {
      // LogService.getInstance().setMessage(e.toString());
      return root;
    }
  }

  private HTMLNode processPrevNode(HTMLNode root, List<HTMLNode> anchors, int i) {
    if (i < 0 || i >= anchors.size())
      return root;
    HTMLNode node = anchors.get(i);
    HTMLNode prevNode = anchors.get(i - 1);
    String path = getIndexPath(node);
    String prevPath = getIndexPath(prevNode);
    String commonPath = getCommonIndexPath(path, prevPath);
    try {
      HTMLNode commonNode = getNodeByIndex(root, commonPath);
      path = path.substring(commonPath.length());
      int idx = path.indexOf('.', 1);
      // path = path.substring(0, idx);
      if (idx > 0) {
        path = path.substring(1, idx);
      } else {
        if (path.charAt(0) == '.')
          path = path.substring(1);
      }

      // if(idx > 1) path = path.substring(2, idx);
      // if(path.charAt(0) == '.') path = path.substring(1);
      // if(path.trim().isEmpty()) return root;
      idx = Integer.parseInt(path);

      HTMLNode newCommonNode = new NodeImpl(commonNode.getValue(), commonNode.getName(),
          TypeToken.TAG);
      List<HTMLNode> children = commonNode.getChildren();
      for (i = idx; i < children.size(); i++) {
        newCommonNode.addChild(children.get(i));
        // children.get(i).setParent(newCommonNode);
      }
      return newCommonNode;
    } catch (Exception e) {
      StringBuilder builder = new StringBuilder();
      builder.append("path value: ").append(path).append('\n');
      builder.append("commonpath: ").append(commonPath).append('\n');
      LogService.getInstance().setMessage(e, path + "<=" + getIndexPath(node));
      return root;
    }
  }

}
TOP

Related Classes of org.vietspider.html.util.HTMLAnchorUtil

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.