Package

Source Code of CrawlerService

import java.net.URL;
import java.util.List;

import org.vietspider.html.HTMLDocument;
import org.vietspider.html.parser.HTMLParser2;
import org.vietspider.html.path2.HTMLExtractor;
import org.vietspider.html.path2.NodePath;
import org.vietspider.html.path2.NodePathParser;
import org.vietspider.html.util.HyperLinkUtil;
import org.vietspider.net.client.WebClient;

/**
*  Author : Nhu Dinh Thuan
*          Email:nhudinhthuan@yahoo.com
* May 15, 2006
*/
public class CrawlerService extends Thread {

  private SingleCrawlThread [] childrenThread;

  private boolean complete = true;

  private String url;
  private NodePath homePath ;
  private NodePath  [] addPaths, removePaths ;

  private List<String> links;

  private int idx = 0;

  private HyperLinkUtil linkUtil = new HyperLinkUtil();
  private NodePathParser pathParser = new NodePathParser();
  private HTMLExtractor htmlExtractor = new HTMLExtractor();
  private WebClient webClient = new WebClient();

  public CrawlerService(){
    childrenThread = new SingleCrawlThread [3];
    for(int i=0; i<childrenThread.length; i++){
      childrenThread[i] = new SingleCrawlThread(webClient, htmlExtractor);
    }
    new Thread(this).start();
  }

  public void run(){
    while(true){
      try {
        processHome();
        processLink();
        Thread.sleep(1000);
      }catch(Exception exp){
        exp.printStackTrace();
      }
    }
  }

  private void processHome() throws Exception {
    if(links != null && links.size() 0) return;
    if(!childrenThread[0].isComplete()) return ;
    if(childrenThread == null) return;
    if(childrenThread[0] == null) return;
    byte [] data = childrenThread[0].getData();
    if(data == null || data.length < 0) return;
    HTMLDocument document =  new HTMLParser2().createDocument(data, "utf-8");
    if(homePath != null) {
      document = htmlExtractor.extract(document, new NodePath[]{homePath});
    }
    linkUtil.createFullNormalLink(document.getRoot(), new URL(url));
    links  = linkUtil.scanSiteLink(document.getRoot());
    idx = 0;
  }

  private void processLink() throws Exception{
    if(links == null || links.size() 1) return;
    if(idx >= links.size()) return;
    for(int i=0; i < childrenThread.length; i++){
      if(!childrenThread[i].isComplete()) continue;
      childrenThread[i].saveData();
      if(idx >= links.size()){
        System.out.println("download completed !");
        return;
      }          
      childrenThread[i].startDownload(url, links.get(idx), idx);
      idx++;
    }
  }
 
  public void startCrawl(String url_, String homePathValue, String [] addPathValues, String [] removePathValues) throws Exception {
    this.url = url_;
    this.webClient.setURL(null, new URL(url));
    if(homePathValue != null) {
      this.homePath = pathParser.toPath(homePathValue);
    }
   
    this.addPaths = new NodePath[addPathValues.length];
   
    for(int i=0 ;i<addPathValues.length; i++){
      this.addPaths[i] = pathParser.toPath(addPathValues[i]);
    }
   
    if(removePathValues != null){
      this.removePaths = new NodePath[removePathValues.length];
      for(int i=0 ;i<removePathValues.length; i++){
        this.removePaths[i] = pathParser.toPath(removePathValues[i]);
      }
    }
   
    for(int i = 0; i < childrenThread.length; i++){
      childrenThread[i].setPaths(addPaths, removePaths);
    }
    childrenThread[0].startDownload(null, url_, 0);
  }

  public boolean isComplete() { return complete; }

}
TOP

Related Classes of CrawlerService

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.