Package org.vietspider.html

Examples of org.vietspider.html.HTMLNode


    StringBuilder builder = new StringBuilder();
   
    List<HTMLNode> links = new ArrayList<HTMLNode>();
   
    for(int i = 0; i < nodes.size(); i++) {
      HTMLNode node = nodes.get(i);
  
      if(isLinkContent(linkNodeChecker, node, 0)) {
        links.add(node);
        if(isValidNode(node)) continue;
      }
     
      if(builder.length() > 0) builder.append(' ');
      builder.append(node.getValue());
    }
   
    if(links.size() < 1) {
      return null;
    }
View Full Code Here


    List<HTMLNode> children = node.getChildren();
    if(children == null
        || children.size() < 1) return new String[0];
    String [] values = new String[children.size()];
    for(int i = 0; i < children.size(); i++) {
      HTMLNode child = children.get(i);
      StringBuilder builder = new StringBuilder();
      builder.append(child.getName().toString());
      builder.append('[').append(getIndex(children, child)).append(']');
      values[i] = builder.toString();
    }
    return values;
  }
View Full Code Here

    }
    return false;
  }
 
  public HTMLNode searchUpper(HTMLNode node, Name...names) {
    HTMLNode parent  = node.getParent();
    if(parent == null) return null;
    for(Name name  : names) {
      if(parent.isNode(name)) return parent;
    }
//    if(parent.isNode(Name.TABLE)
//        || parent.isNode(Name.DIV)
//        || parent.isNode(Name.CENTER)) {
//      return parent;
View Full Code Here

  private boolean isWrapperContent(HTMLNode node){
    List<HTMLNode> children = node.getChildren();
    if(children == null) return false;
    for(int i = 0; i < children.size(); i++) {
      HTMLNode child = children.get(i);
      if(child.isNode(Name.CONTENT)
          || isWrapperContent(child)) return true;
    }
    return false;
  }
View Full Code Here

//      System.out.println(" tra ve do max");
      return null;
    }
    node = values.get(max);
    while(true) {
      HTMLNode newNode = search(node);
      if(newNode == null) return node;
      node = newNode;
    }
   
////    if(node.isNode(Name.TABLE)) {
View Full Code Here

      score += new ScoreCalculator().calculate(pattern, sentence, word);
    }
   
//    System.out.println("================================================================");
    for(int i = 0;  i < nodes.size(); i++) {
      HTMLNode n = nodes.get(i);
      if(n.isNode(Name.OBJECT))  {
        score += 1000;
      } else if(n.isNode(Name.IMG))  {
//        System.out.println(new String(n.getValue()));
        Attributes attributes = n.getAttributes();
        score += calculateFromAttr(attributes.get("width"));
        score += calculateFromAttr(attributes.get("height"));
      }
    }
   
View Full Code Here

 
  protected ContentChecker contentChecker = new ContentChecker();
 
  public void removeNode(HTMLNode node) {
    if(node == null) return;
    HTMLNode parent = node.getParent();
    if(parent == null || !parent.hasChildren()) return ;
    parent.removeChild(node);
//    if(parent.getChildren().size() < 1) removeNode(parent);
  }
View Full Code Here

//  System.out.println(" chon cai thu " + index + " : "+ renderer.getScore()+ " : ");
    return renderer;
  }

  public HTMLNode extractContent(HTMLDocument document, String url, boolean clean) throws Exception  {
    HTMLNode body = ContentRendererFactory.searchBody(document);
//    PageExtractor pageExtractor = new PageExtractor();
//    pageExtractor.filter(body, NodeChecker.createDefaultCheckers());
    ContentRenderer renderer = ContentRendererFactory.createContentRenderer(body, url);

    NodeRenderer nodeRenderer = extractContent(renderer);
    if(nodeRenderer == null) return null;

    List<HTMLNode> nodes = nodeRenderer.getContents();
//  for(int i = 0; i< nodes.size(); i++) {
//    System.out.println(nodes.get(i).getName() + " : ");
//  }
    HTMLNode  value = parentUtil.getUpParent(nodes);

//  ScoreCalculator.printNode(nodeRenderer);
//  System.out.println("thay co cai na " + nodeRenderer.getScore() );

//  java.io.File file  = new java.io.File("F:\\Temp2\\web\\output\\a.txt");
View Full Code Here

  private int countLink(List<HTMLNode> validNodes, HTMLNode node) {
    if(node == null) return 0;
    NodeIterator iterator = node.iterator();
    int counter = 0;
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(validNodes.contains(n)) continue;
      if(n.isNode(Name.A)) counter++;
    }
//  System.out.println(" thay co cai na "+ counter);
    return counter;
  }
View Full Code Here

      char [] chars = node.getValue();
      if(!isEmpty(chars)) {
        for(int k = 0; k < chars.length; k++) {
          builder.append(chars[k] == '\n' ? ' ' : chars[k]);
        }
        HTMLNode parent = node.getParent();
        if(parent != null && parent.isNode(Name.SPAN)) builder.append(' ');
      }
      break;
    case IMG:
      Attributes attributes = node.getAttributes();
      int value = calculateFromAttr(attributes.get("width"), 200);
View Full Code Here

TOP

Related Classes of org.vietspider.html.HTMLNode

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.