Package org.vietspider.html

Examples of org.vietspider.html.NodeIterator


    }
  }
 
  private int countWord(HTMLNode node) {
    if(node == null) return 0;
    NodeIterator nodeIterator = node.iterator();
    int word = 0;
    while(nodeIterator.hasNext()) {
      HTMLNode iterNode = nodeIterator.next();
      if(getAncestor(iterNode, Name.A, 0, 5) != null) continue;
      if(iterNode.isNode(Name.CONTENT)) {
        String text = iterNode.getTextValue();
        word += textCounter.countWord(text, 0, text.length());
      }
View Full Code Here


 
  private boolean isLinkContainer(HTMLNode node) {
    List<HTMLNode> children = node.getChildren();
    if(children == null) return false;
    if(isListNode(children)) {
      NodeIterator nodeIterator = node.iterator();
      int counter = 0;
      while(nodeIterator.hasNext()) {
        HTMLNode iterNode = nodeIterator.next();
        if(iterNode.isNode(Name.A)) counter++;
      }
      return counter >= children.size() - 3;
    }
   
View Full Code Here

 
  private boolean isLinkDiv(HTMLNode node) {
    if(node.getChildren() == null) return false;
   
    List<HTMLNode> ignores = new ArrayList<HTMLNode>();
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(Name.A)
          && !linkNodeChecker.isValid(new CheckModel(n), 0)) ignores.add(n);
    }
   
    int counter = 0;
    iterator = node.iterator(ignores);
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(Name.CONTENT)) {
        counter += countWord(n);
      }
    }
    return counter < 5 && ignores.size() > 1;
View Full Code Here

//        );
  }
 
  private static int countLink(HTMLNode node) {
    if(node == null) return 0;
    NodeIterator iterator = node.iterator();
    int counter = 0;
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(Name.A)) counter++;
    }
//    System.out.println(" thay co cai na "+ counter);
    return counter;
  }
View Full Code Here

    return values;
  }
 
  protected HTMLNode searchBody(HTMLDocument document) throws Exception {
    RefsDecoder decoder = new RefsDecoder();
    NodeIterator iterator = document.getRoot().iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.CONTENT)) continue;
      char [] chars = node.getValue();
      chars = decoder.decode(chars);

      chars = CharsUtil.cutAndTrim(chars, 0, chars.length);
View Full Code Here

    this(root, null, type, false);
  }

  public TextRenderer (HTMLNode root,
      final List<HTMLNode> contents, final int type, final boolean constain) {
    NodeIterator iterator = root.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      switch (node.getName()) {
      case CONTENT:
        char [] chars = node.getValue();
        if(!isEmpty(chars)) {
          if(isValid(contents, node, constain)) {
            int start = builder.length();
            for(int k = 0; k < chars.length; k++) {
              builder.append(chars[k] == '\n' ? ' ' : chars[k]);
            }
            HTMLNode parent = node.getParent();
            if(parent != null && parent.isNode(Name.SPAN)) builder.append(' ');
           
            int end = builder.length();
            positions.add(new NodePosition(node, start, end));
          }
        }
        break;
      case IMG:
        positions.add(new NodePosition(node, -1, -1));
        break;
      case H1:
      case H2:
      case H3:
      case H4:
      case H5:
      case H6:
      case TR:
      case TABLE:
      case TD:
      case P:
      case DIV:
      case BR:
      case LI:       
        if(!isEndWithNewLine(builder)) {
          builder.append('\n');
          if(type == RENDERER) builder.append('\n');
        }
        break;
      case SCRIPT:
      case STYLE:
        NodeImpl nodeImpl = (NodeImpl) node;
        if(nodeImpl.getType() == TypeToken.TAG && iterator.hasNext()) iterator.next();
        break;
      default:
        if(builder.length() > 0) {
          char c = builder.charAt(builder.length()-1);
          if(!(Character.isWhitespace(c)
View Full Code Here

  public ContentRendererBak (HTMLNode root,
      List<HTMLNode> ignores, List<HTMLNode> wrappers, LinkChecker linkChecker) {
    this.linkChecker = linkChecker;
    StringBuilder builder = new StringBuilder();
   
    NodeIterator iterator = root.iterator(ignores);
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      switch (node.getName()) {
      case CONTENT:
        char [] chars = node.getValue();
        if(!isEmpty(chars)) {
//          if(isValid(contents, node, constain)) {
            int start = builder.length();
            for(int k = 0; k < chars.length; k++) {
              builder.append(chars[k] == '\n' ? ' ' : chars[k]);
            }
            HTMLNode parent = node.getParent();
            if(parent != null && parent.isNode(Name.SPAN)) builder.append(' ');
           
            int end = builder.length();
            positions.add(new NodePosition(node, start, end));
          }
//        }
        break;
      case IMG:
//        positions.add(new NodePosition(node, -1, -1));
        break;
      case H1:
      case H2:
      case H3:
      case H4:
      case H5:
      case H6:
      case P:
      case BR:
      case LI:       
        if(!isEndWithNewLine(builder)) {
          builder.append('\n');
        }
        break;
       
      case IFRAME:
        System.out.println(builder.length());
        separateBlock(builder, 2);
//        System.out.println("====>" + builder.length());
        break;
      case DIV:
        separateBlock(builder, node, 2, wrappers);
        break;
      case TABLE:
        separateBlock(builder, node, 4, wrappers);
        break;
      case TR:
        separateBlock(builder, node, 2, wrappers);
        break;
      case TD:
        separateBlock(builder, node, 2, wrappers);
        break;
      case SCRIPT:
      case STYLE:
        NodeImpl nodeImpl = (NodeImpl) node;
        if(nodeImpl.getType() == TypeToken.TAG && iterator.hasNext()) iterator.next();
        break;
      default:
        if(builder.length() > 0) {
          char c = builder.charAt(builder.length()-1);
          if(!(Character.isWhitespace(c)
View Full Code Here

    return new ContentRenderer(body, ignores, linkNodeChecker);
  }

  public static HTMLNode searchBody(HTMLDocument document) throws Exception {
    RefsDecoder decoder = new RefsDecoder();
    NodeIterator iterator = document.getRoot().iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.CONTENT)) continue;
      char [] chars = node.getValue();
      chars = decoder.decode(chars);

      chars = CharsUtil.cutAndTrim(chars, 0, chars.length);
View Full Code Here

    return nodeRenderer;
  }

  private int countLink(List<HTMLNode> validNodes, HTMLNode node) {
    if(node == null) return 0;
    NodeIterator iterator = node.iterator();
    int counter = 0;
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(validNodes.contains(n)) continue;
      if(n.isNode(Name.A)) counter++;
    }
//  System.out.println(" thay co cai na "+ counter);
    return counter;
View Full Code Here

  }
 
  boolean isLink(CheckModel model) {
    HTMLNode node = model.getNode();
    List<HTMLNode> links = new ArrayList<HTMLNode>();
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(Name.A)) links.add(n);
      else if(n.isNode(Name.DIV)
          || n.isNode(Name.SPAN)) {
        if(hasOnclick(n)) {
          links.add(n);
View Full Code Here

TOP

Related Classes of org.vietspider.html.NodeIterator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.