Package org.vietspider.html.path2

Source Code of org.vietspider.html.path2.HTMLExtractor

/***************************************************************************
* Copyright 2001-2007 The VietSpider         All rights reserved.       *
**************************************************************************/
package org.vietspider.html.path2;

import java.util.ArrayList;
import java.util.List;

import org.vietspider.html.HTMLDocument;
import org.vietspider.html.HTMLNode;
import org.vietspider.html.parser.HTMLParser2;
import org.vietspider.token.attribute.Attribute;
import org.vietspider.token.attribute.Attributes;

/**
* Author : Nhu Dinh Thuan
*          nhudinhthuan@yahoo.com
* Dec 6, 2007 
*/
public class HTMLExtractor {
 
  public HTMLDocument extractFirst(HTMLDocument document, NodePath[] nodePaths) {
    HTMLDocument [] documents = extractRow(document, nodePaths);
    if(documents.length < 1) return null;
    return documents[0];
  }
 
  public HTMLDocument extract(HTMLDocument document, NodePath... nodePaths) {
    HTMLNode root = document.getRoot();
//    CharsToken tokens = document.getTokens();
   
    HTMLNode newRoot = HTMLParser2.clone(root);
    HTMLDocument newDocument  = new HTMLDocument();
//    CharsToken newTokens = new CharsToken(newDocument);
//    newTokens.push((NodeImpl)newRoot);

    for(int i = 0; i < nodePaths.length; i++) {
      List<HTMLNode> htmlNodes = matchNodes(root, nodePaths[i]);
      if(htmlNodes == null ) continue;
      for(HTMLNode htmlNode : htmlNodes) {
        if(htmlNode == null) continue;
//        extractTokens(tokens, newTokens, htmlNode);
        newRoot.addChild(htmlNode);
//        htmlNode.setParent(newRoot);
      }
    }
   
    newDocument.setRoot(newRoot);
    return newDocument;
  }

  public HTMLDocument[] extractRow(HTMLDocument document, NodePath[] nodePaths) {
    List<List<HTMLNode>> listHtmlValues = new ArrayList<List<HTMLNode>>();
    HTMLNode root = document.getRoot();

    for(int i = 0; i < nodePaths.length; i++) {
      List<HTMLNode> matchValues = matchNodes(root, nodePaths[i]);
      if(matchValues != null) listHtmlValues.add(matchValues);
    }

    if(listHtmlValues.size() == 0 || listHtmlValues.get(0) == null) return new HTMLDocument[0];

    List<HTMLNode> htmlValues = listHtmlValues.get(0);
    HTMLDocument [] newDocuments = new HTMLDocument[htmlValues.size()];

    for(int i = 0; i < htmlValues.size(); i++) {
      HTMLNode html = HTMLParser2.clone(root);
      if(htmlValues.get(i) == null) continue;
      html.addChild(htmlValues.get(i));
//      htmlValues.get(i).setParent(html);
      for(int j = 1; j < listHtmlValues.size(); j++) {
        List<HTMLNode> newHtmlValues = listHtmlValues.get(j);
        if(i > newHtmlValues.size()) break;
        try {
          if(newHtmlValues.get(i) == null) continue;
          html.addChild(newHtmlValues.get(i));
//          newHtmlValues.get(i).setParent(html);
        } catch (Exception e) {
          continue;
        }
      }
      newDocuments[i] = new HTMLDocument(html);
    }
    return newDocuments;
  }
 
  public HTMLNode lookNode(HTMLNode htmlRoot, NodePath nodePath) {
    List<HTMLNode> list = matchNodes(htmlRoot, nodePath);
    return list == null || list.size() < 1 ? null : list.get(0);
  }
 
  public List<HTMLNode> matchNodes(HTMLNode root, NodePath[] nodePaths) {
    List<HTMLNode> htmlValues = new ArrayList<HTMLNode>();
    for(NodePath nodePath : nodePaths) {
      List<HTMLNode> list = matchNodes(root, nodePath);
      if(list == null || list.size() < 1) continue;
      htmlValues.addAll(list);
    }
    return htmlValues;
  }

  public List<HTMLNode> matchNodes(HTMLNode htmlRoot, NodePath nodePath) {
    if(nodePath == null) return null;
   
    INode [] inodes = nodePath.getNodes();
    if(inodes.length < 1) return null;
    List<HTMLNode> htmlValues = new ArrayList<HTMLNode>();
    if(inodes[0] instanceof NodeExp) {
      matchNodes(htmlRoot, (NodeExp)inodes[0], htmlValues);
    } else {
      htmlValues.add(lookNode(htmlRoot, (Node)inodes[0]));
    }

    for(int i = 1; i < inodes.length; i++) {
      HTMLNode [] htmlNodes = htmlValues.toArray(new HTMLNode[htmlValues.size()]);
      htmlValues.clear();

      if(inodes[i] instanceof NodeExp) {
        NodeExp nodeExp = (NodeExp)inodes[i];
        for(HTMLNode htmlNode : htmlNodes) {
          if(htmlNode == null) continue;
          matchNodes(htmlNode, nodeExp, htmlValues);
        }
        continue;
      }
     
      Node node = (Node)inodes[i];
      for(HTMLNode htmlNode : htmlNodes) {
        if(htmlNode == null) continue;
        HTMLNode test = lookNode(htmlNode, node);
        if(test == null) {
          continue;
        }
        htmlValues.add(lookNode(htmlNode, node));
      }
    }
    if(htmlValues.size() < 1 ) return null;
    return htmlValues;
  }

  public List<HTMLNode> matchNodes(HTMLNode htmlNode, NodeExp nodeExp, List<HTMLNode> htmlValues) {
    List<HTMLNode> htmlChildren = htmlNode.getChildren();
//    List<HTMLNode> htmlValues = new ArrayList<HTMLNode>(); 
//    System.out.println("node expresstion "+nodeExp.toString());
//    System.out.println("attributes length "+nodeExp.getAttributes().length);
    int counter  = 0;
    NodeMatcher matcher = new NodeMatcher();
    for(int i = 0; i < htmlChildren.size(); i++) {
      HTMLNode childNode =  htmlChildren.get(i);
      if(nodeExp.getName() != childNode.getName()) continue;
      if(matcher.match(nodeExp.getPattern(), counter)) {
        Attribute [] attrs = nodeExp.getAttributes();
        if(attrs == null || attrs.length < 1) {
//          System.out.println(" da xay ra roi ");
          htmlValues.add(childNode);
        } else {
//          System.out.println(" xay ra ");
          Attributes nodeAttributes = childNode.getAttributes();
          if(matcher.contains(nodeAttributes, attrs)) htmlValues.add(childNode);
        }
      }
      counter++;
   
    return htmlValues;
  }

  public HTMLNode lookNode(HTMLNode htmlNode, Node inode) {
    List<HTMLNode> htmlChildren = htmlNode.getChildren();

    int counter  = 0;
    for(int i = 0; i < htmlChildren.size(); i++) {
      if(inode.getName() != htmlChildren.get(i).getName()) continue;
      if(inode.getIndex() == counter) return htmlChildren.get(i);
      counter++;
   

    return null;
  }
 
  public void remove(HTMLNode root, boolean isRemoveFrom, NodePath ... removePaths) {
    if(removePaths == null || removePaths.length < 1) return;
    if(isRemoveFrom) removeFrom(root, removePaths[removePaths.length-1]);
    remove(root, removePaths);
  }

  public void remove(HTMLNode root, NodePath ... nodePaths){
    List<HTMLNode> nodes = new ArrayList<HTMLNode>();
    for(NodePath nodePath : nodePaths) {
      List<HTMLNode> matchValues = matchNodes(root, nodePath);
//      System.out.println(root.getName() + " : " + nodePaths + " : "+ matchValues);
      if(matchValues != null) nodes.addAll(matchValues);
    }

    for(HTMLNode node : nodes) {
      if(node == null) continue;
      HTMLNode parent  = node.getParent();
      if(parent == null) continue;
//      System.out.println(" truoc " + parent.getChildren().size());
      parent.removeChild(node);
//      System.out.println(" sau " + parent.getChildren().size());
    }
  }

  public void removeFrom(HTMLNode root, NodePath path){
    HTMLNode element = lookNode(root, path);
    if (element == null) return;
    java.util.Iterator<HTMLNode> iter =  element.getParent().getChildren().iterator();
    boolean remove = false;
    while(iter.hasNext()){
      HTMLNode ele = iter.next();
      if(!remove) remove = ele == element;
      if(remove) iter.remove();
    }
  }
 
/* private void extractTokens(CharsToken tokens, CharsToken newTokens, HTMLNode htmlNode) {
    HTMLNode breakNode = searchEndNode(htmlNode);
    Iterator<NodeImpl> iterator = tokens.iterator();
    boolean start = false;
    while(iterator.hasNext()) {
      NodeImpl nodeImpl = iterator.next();
      if(nodeImpl == breakNode) break;
      if(!start && nodeImpl == htmlNode) start = true;
      if(start) newTokens.push(nodeImpl);
    }
  }
 
  private HTMLNode searchEndNode(HTMLNode node) {
    HTMLNode parent  = node.getParent();
    if(parent == null) return null;
    List<HTMLNode> children = parent.getChildren();
    for(int i = 0; i < children.size(); i++) {
      if(children.get(i) != node) continue;
      if(i == children.size() - 1) return searchEndNode(parent);
      return children.get(i+1);
    }
    return null;
  }*/

}
 
TOP

Related Classes of org.vietspider.html.path2.HTMLExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.