Package org.vietspider.html.renderer.checker

Source Code of org.vietspider.html.renderer.checker.ULNodeChecker

/***************************************************************************
* Copyright 2001-2009 The VietSpider         All rights reserved.       *
**************************************************************************/
package org.vietspider.html.renderer.checker;

import java.util.ArrayList;
import java.util.List;

import org.vietspider.html.HTMLNode;
import org.vietspider.html.Name;
import org.vietspider.html.NodeIterator;
import org.vietspider.html.renderer.NodeComparator;
import org.vietspider.html.renderer.RenderNodeUtils;

/**
* Author : Nhu Dinh Thuan
*          nhudinhthuan@yahoo.com
* Feb 9, 2009 
*/
public class ULNodeChecker extends NodeChecker {
 
  public ULNodeChecker(int level) {
    super(Name.UL, level);
  }

  boolean check(CheckModel model) {
//    HTMLNode node = model.getNode();
    if(model.hasRawData()) return true;
   
   /* if(model.getTextBlockStatus() == CheckModel.UNCHECK) {
      boolean isTextBlock = contentChecker.isTextBlock(node, true, 50, 5);
      model.setTextBlockStatus( isTextBlock ? CheckModel.RIGHT : CheckModel.NOT);
    }
    if(model.hasTextBlock()) return true;*/
   
    HTMLNode node = model.getNode();
    if(linkBlockChecker.isLink(model)) {
      toContainerAncestor(model);
      return false;
    }
    if(isLinkList(node)) {
      toContainerAncestor(model);
      return false;
    }
    if(model.getTotalOfLink() > 1
        && !linkBlockChecker.hasParagraph(node, 1)) {
      toContainerAncestor(model);
      return false;
    }
    return true;
//    return RenderNodeUtils.countWord(node) > 30;
  }
 
  public boolean isLinkList(HTMLNode root) {
    List<HTMLNode> children = root.getChildren();
    if(children == null) return true;
    List<HTMLNode> list = new ArrayList<HTMLNode>();
    for(int i = 0; i < children.size(); i++) {
      if(!children.get(i).isNode(Name.LI)) continue;
      if(hasLink(children.get(i))) list.add(children.get(i));
    }
    if(list.size() < 3) return false;
    HTMLNode node = list.get(0);
   
    NodeComparator nodeComparator = new NodeComparator();
    int counter = 1;
    int onlyLinkCounter = onlyLink(node) ? 1 : 0;
    for(int i = 1; i < list.size(); i++) {
      HTMLNode n = list.get(i);
      if(onlyLink(n)) onlyLinkCounter++;
      if(!nodeComparator.compare(node, n)) continue;
      counter++;
    }
   
    int rate = (counter*100)/list.size();
    if(rate >= 85) return true;
    return (onlyLinkCounter*100)/list.size() > 85;
  }
 
  private boolean hasLink(HTMLNode node) {
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(Name.A)) return true;
    }
    return false;
  }
 
  private boolean onlyLink(HTMLNode node) {
    List<HTMLNode> children = node.getChildren();
    if(children == null || children.size() != 1) return false;
    return children.get(0).isNode(Name.A);
  }
 
  private void toContainerAncestor(CheckModel model) {
    HTMLNode node = model.getNode();
    HTMLNode div = RenderNodeUtils.getAncestor(node, Name.DIV, 0, 3);
    if(div != null) {
      if(linkBlockChecker.hasParagraph(div, 1)) return;
      model.setRemoveNode(div);
      return;
    }
   
    HTMLNode tr = RenderNodeUtils.getAncestor(node, Name.TABLE, 0, 5);
    if(tr != null) {
      if(linkBlockChecker.hasParagraph(tr, 1)) return;
      model.setRemoveNode(tr);
      return;
    }
   
  }
 
  /*private boolean isLinkListBlock(HTMLNode root) {
    List<HTMLNode> children = root.getChildren();
    if(children == null) return true;
    int counter = 0;
    HTMLNode sample = null;
    for(int i = 0; i < children.size(); i++) {
      if(sample  == null) {
        sample = children.get(i);
      } else {
        if(!compare(sample, children.get(i))) return false;
      }
      NodeIterator iterator = children.get(i).iterator();
      boolean hasLink = false;
      while(iterator.hasNext()) {
        HTMLNode n = iterator.next();
        if(!n.isNode(Name.A)) continue;
        hasLink = true;
        counter++;
      }
      if(!hasLink) return false;
    }
    if(children.size() == 0) return false;
    return (counter*100)/children.size() > 75;
  }
 
  private boolean compare(HTMLNode node1, HTMLNode node2) {
    if(node1.getName() != node2.getName()) return false;
    List<HTMLNode> children1 = node1.getChildren();
    List<HTMLNode> children2 = node2.getChildren();
    if(children1 == null) {
      if(children2 == null) return true;
      return false;
    } else if(children2 == null) {
      return false;
    }
    if(children1.size() != children2.size()) return false;
    for(int i = 0;  i < children1.size(); i++) {
      if(!compare(children1.get(i), children2.get(i))) return false;
    }
    return true;
  }*/

}
 
TOP

Related Classes of org.vietspider.html.renderer.checker.ULNodeChecker

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.