Package org.vietspider.html.template

Source Code of org.vietspider.html.template.VBulletinForumExtractor

/***************************************************************************
* Copyright 2001-2009 The VietSpider         All rights reserved.       *
**************************************************************************/
package org.vietspider.html.template;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.vietspider.html.HTMLNode;
import org.vietspider.html.Name;
import org.vietspider.html.NodeIterator;
import org.vietspider.html.path2.HTMLExtractor;
import org.vietspider.html.path2.NodePath;
import org.vietspider.html.path2.NodePathParser;
import org.vietspider.html.util.NodeHandler;
import org.vietspider.token.attribute.Attribute;
import org.vietspider.token.attribute.Attributes;

/**
* Author : Nhu Dinh Thuan
*          nhudinhthuan@yahoo.com
* Apr 13, 2009 
*/
public class VBulletinForumExtractor {
 
  private List<String> extractPaths = new ArrayList<String>();
  private String pagePath;
  private String titlePath;
  private String userPath = null;
  private String postPath = null;
 
  public void extract(HTMLNode root) {
   /* List<HTMLNode> list = searchDataNode(root);
    NodePathParser pathParser = new NodePathParser();
    for(int i = 0; i < list.size(); i++) {
      extractPaths.add(pathParser.toPath(list.get(i)).toString());
    }
  }
 
  private void searchDataNode(HTMLNode root) {*/
    NodePathParser pathParser = new NodePathParser();
    NodeIterator iterator = root.iterator();
  
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.DIV)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("id");
      if(attribute == null) continue;
      String value = attribute.getValue();
      if(value == null) continue;
     
      if("posts".equalsIgnoreCase(value)) {
        HTMLNode titleNode = searchTitleNode(root, node);
        if(titleNode != null) {
          titlePath = titleNode.getName()+"[0]";
          extractPaths.add(pathParser.toPath(titleNode).toString());
        }
       
        HTMLNode pageNode = searchPageNode(node);
        if(pageNode == null) pageNode = searchPageNode2(root);
        if(pageNode != null) {
          pagePath = "TABLE[0]";
          extractPaths.add(pathParser.toPath(pageNode).toString());
        }
       
        String threadPath = pathParser.toPath(node).toString();
       
        HTMLNode userNode = searchUserNode(node);
        if(userNode != null) {
          userPath = pathParser.toPath(userNode).toString();
         
          String path = userPath.substring(threadPath.length());
          int index = path.indexOf('[');
          if(index > -1) {
            path = path.substring(0, index+1) + "*" + path.substring(index+2, path.length());
          }
          if(titleNode != null && titleNode.isNode(Name.DIV)) {
            userPath = "DIV[1]" + path;
          } else {
            userPath = "DIV[0]" + path;
          }
          index = userPath.lastIndexOf("TD[");
          if(index > 0) {
            int end = userPath.indexOf(']', index);
            if(end > 0) {
              userPath = userPath.substring(0, index+3) + "i<2"+ userPath.substring(end);
            }
          }
        }
        if(userPath == null) return;
       
       
        HTMLNode postNode = searchContentNode(node, "post_message");
        if(postNode != null) {
          postPath = pathParser.toPath(postNode).toString();
         
          String path = postPath.substring(threadPath.length());
          int index = path.indexOf('[');
          if(index > -1) {
            path = path.substring(0, index+1) + "*" + path.substring(index+2, path.length());
          }
         
          if(titleNode != null && titleNode.isNode(Name.DIV)) {
            postPath = "DIV[1]" + path;           
          } else {
            postPath = "DIV[0]" + path;
          }
        } else {
          int start = userPath.indexOf("TD[");
          if(start > 0) {
            int end  = userPath.indexOf(']', start);
            try {
              int indexUser = Integer.parseInt(userPath.substring(start+3, end));
              postPath = userPath.substring(0, start) +"TD[" + String.valueOf(indexUser+1)+"]";
            } catch (Exception e) {
              e.printStackTrace();
            }
          }
          
        }
       
//        HTMLNode preNode = searchPreNode(node);
//        if(preNode != null) list.add(preNode);
        extractPaths.add(threadPath);
      }
    }
  }
 
  private HTMLNode searchPageNode(HTMLNode root) {
    HTMLNode parent = root.getParent();
    if(parent == null) return null;
    NodeIterator iterator = parent.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.CONTENT)) continue;
      String content  = node.getTextValue().toLowerCase();
      if(content.indexOf("page") < 0
          && content.indexOf("trang") < 0) continue;
      HTMLNode table = upParent(node, Name.TABLE);
      if(table == null || !isPageList(table)) continue;
      return table;
    }
    return null;
  }
 
  private HTMLNode searchUserNode(HTMLNode root) {
    NodeIterator iterator = root.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.A)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("class");
      if(attribute == null) continue;
      String value = attribute.getValue();
      if(value == null) continue;
      value = value.toLowerCase();
      if(value.indexOf("bigusername") > -1) return node;
    }
    return null;
  }
 
  private HTMLNode searchContentNode(HTMLNode root, String clazz) {
    NodeIterator iterator = root.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.DIV)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("id");
      if(attribute == null) continue;
      String value = attribute.getValue();
      if(value == null) continue;
      value = value.toLowerCase();
      if(value.indexOf(clazz) > -1) {//"post_message"
        return upParent(node, Name.TD);
      }
    }
    return null;
  }
 
  private boolean isPageList(HTMLNode node) {
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.A)) continue;
      Attributes attributes = n.getAttributes();
      Attribute attribute = attributes.get("href");
      if(attribute == null) continue;
      String value = attribute.getValue();
      if(value == null) continue;
      if(value.indexOf("page=") > -1) return true;
    }
    return false;
  }
 
  private HTMLNode searchPageNode2(HTMLNode node) {
    NodeIterator iterator = node.iterator();
    HTMLNode table = null;
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.A)) continue;
      List<HTMLNode> children = n.getChildren();
      if(children == null
          || children.size() != 1
          || !children.get(0).isNode(Name.CONTENT)) continue;
      String text = children.get(0).getTextValue();
      try {
        Integer.parseInt(text.trim());
        table = upParent(n, Name.TABLE);
        break;
      } catch (Exception e) {
      }
    }
    if(table == null) return null;
    NodeHandler nodeHandler = new NodeHandler();
    iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.CONTENT)) continue;
      String text = n.getTextValue().toLowerCase().trim();
      if(text.startsWith("trang") || text.startsWith("page")) {
        if(nodeHandler.count(text) < 5) return table;
      }
    }
    return table;
  }
 
 
  private HTMLNode upParent(HTMLNode node, Name...names) {
    if(node == null) return null;
    for(int i = 0; i < names.length; i++) {
      if(node.isNode(names[i])) return node;
    }
    return upParent(node.getParent(), names);
  }
 
  private HTMLNode searchTitleNode(HTMLNode root, HTMLNode node) {
    HTMLExtractor extractor  = new HTMLExtractor();
    NodePathParser pathParser = new NodePathParser();
    String title = "title";
    try {
      NodePath nodePath  = pathParser.toPath("HEAD.TITLE");
      HTMLNode titleNode = extractor.lookNode(root, nodePath);
      if(titleNode.hasChildren()) {
        title  = titleNode.getChild(0).getTextValue();
      }
    } catch (Exception e) {
      return null;
    }
   
    HTMLNode parent = node.getParent();
    if(parent == null) return null;
    NodeIterator iterator = parent.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.CONTENT)) continue;
      if(n.getParent().isNode(Name.A)) continue;
      String content = n.getTextValue();
      if(indexOf(title, content)) return upParent(n, Name.TD, Name.DIV, Name.STRONG);
    }
    return null;
  }
 
 
 
  private boolean indexOf(String title, String content) {
    int index = 0;
    while(index < content.length()) {
      char c = content.charAt(index);
      if(Character.isLetterOrDigit(c)) break;
      index++;
    }
    if(index < content.length())  content = content.substring(index);
   
    index = content.length() - 1;
    while(index > -1) {
      char c = content.charAt(index);
      if(Character.isLetterOrDigit(c)) break;
      index--;
    }
   
    if(index > 0) content = content.substring(0, index);
   
    index = title.indexOf(content);
    return index > -1 & index < content.length();
  }

  public List<String> getExtractPaths() { return extractPaths; }

  public String getPagePath() { return pagePath; }

  public String getTitlePath() { return titlePath; }

  public String getUserPath() { return userPath; }

  public String getPostPath() { return postPath; }

}
TOP

Related Classes of org.vietspider.html.template.VBulletinForumExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.