Package

Source Code of SingleCrawlThread

import java.io.File;
import java.io.FileOutputStream;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.vietspider.chars.URLUtils;
import org.vietspider.html.HTMLDocument;
import org.vietspider.html.parser.HTMLParser2;
import org.vietspider.html.path2.HTMLExtractor;
import org.vietspider.html.path2.NodePath;
import org.vietspider.net.client.HttpResponseReader;
import org.vietspider.net.client.WebClient;

/**
*  Author : Nhu Dinh Thuan
*          Email:nhudinhthuan@yahoo.com
* May 15, 2006
*/
public class SingleCrawlThread extends Thread {
 
  private String url;
  private String referer;
 
  private NodePath [] addPaths;
  private NodePath [] removePaths;

  private int index = 0;

  private byte[] data;
 
  private HTMLExtractor htmlExtractor;
 
  private  URLUtils urlUtils = new URLUtils();
  private WebClient webClient;

  public SingleCrawlThread(WebClient webClient, HTMLExtractor extractor){
    this.webClient = webClient;
    this.htmlExtractor = extractor;
    
    new Thread(this).start();
  }
 
  public void setPaths(NodePath[] addPaths, NodePath[] removePaths) {
    this.addPaths = addPaths;
    this.removePaths = removePaths;
  }

  public void run(){
    while(true){
      try{
        if(url != null){
          System.out.println("start download "+url);
          data = download();
          url = null;
        }
        Thread.sleep(1000);
      }catch(Exception exp){
        exp.printStackTrace();
      }
    }
  }

  public boolean isComplete(){
    return url == null;
  }

  public void startDownload(String referer_, String url_, int idx){
    this.url = url_;
    this.referer = referer_;
    this.index = idx;
  }
 
  public byte[] getData() { return data; }

  public void saveData() throws Exception {
    String name = String.valueOf(index)+".htm";
    saveData(name);
  }

  public void saveData(String name) throws Exception {
    if(data == null || data.length < 1) return;
    HTMLDocument doc = new HTMLParser2().createDocument(data, "utf-8");
    if(this.addPaths == null) return;
    doc = htmlExtractor.extract(doc, this.addPaths);
    if(removePaths != null && removePaths.length > 0){
      htmlExtractor.remove(doc.getRoot(), removePaths);
    }
    data = doc.getTextValue().getBytes("utf-8");
    if(data.length < 1) return;
    File file = new File(name);
    FileOutputStream output = new FileOutputStream(file);
    output.write(data);
    output.flush();
    output.close();
    data = null;
  }

  private byte[] download() throws Exception {
    HttpGet httpGet = null;
    try {
      String address = urlUtils.getCanonical(url);
      httpGet = webClient.createGetMethod(address, referer);     

      if(httpGet == null) return null;
      HttpHost httpHost = webClient.createHttpHost(address);
      HttpResponse httpResponse = webClient.execute(httpHost, httpGet);

      HttpResponseReader httpResponseReader = new HttpResponseReader();
      return httpResponseReader.readBody(httpResponse);
    } catch(Exception exp) {
      throw exp;
    }
  }


}
TOP

Related Classes of SingleCrawlThread

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.