Package spiderman.plugin.impl

Source Code of spiderman.plugin.impl.FetchPointImpl

package spiderman.plugin.impl;

import org.eweb4j.spiderman.fetcher.FetchRequest;
import org.eweb4j.spiderman.fetcher.FetchResult;
import org.eweb4j.spiderman.plugin.FetchPoint;
import org.eweb4j.spiderman.spider.SpiderListener;
import org.eweb4j.spiderman.task.Task;
import org.eweb4j.spiderman.xml.Site;
import org.eweb4j.util.CommonUtil;

import spiderman.plugin.util.PageFetcherImpl;
import spiderman.plugin.util.SpiderConfig;

/**
* 一个Host一个FetchPointImpl对象
* @author weiwei l.weiwei@163.com
* @date 2013-1-7 下午06:40:05
*/
public class FetchPointImpl implements FetchPoint{

  private SpiderListener listener = null;
  private Site site = null;
 
  public void init(Site site, SpiderListener listener) {
    this.site = site;
    this.listener = listener;
  }

  public void destroy() {
  }
 
  public static void main(String[] args){
    PageFetcherImpl fetcher = new PageFetcherImpl();
    SpiderConfig config = new SpiderConfig();
    config.setCharset("utf-8");
    config.setPolitenessDelay(200);
    fetcher.setConfig(config);
    fetcher.init(null);
    try {
      String url = "http://www.livingsocial.com/cities/1964-klang-valley-kuala-lumpur/deals/638602-patong-bay-resotel-return-flight?append_ref_code=source_cities_show";
      FetchRequest req = new FetchRequest();
      req.setUrl(url);
      FetchResult rs = fetcher.fetch(req);
      System.out.println(rs);
      System.out.println(rs.getPage().getContent());
    } catch (Exception e) {
      e.printStackTrace();
    }
//    String json = "{\"id\":12,\"name\":\"weiwei\"}";
//    Map<String, Object> map = CommonUtil.parse(json, Map.class);
//    System.out.println(map);
  }
 
  public FetchResult fetch(Task task, FetchResult result) throws Exception {
    synchronized (site) {
      if (site.fetcher == null){
        PageFetcherImpl fetcher = new PageFetcherImpl();
        SpiderConfig config = new SpiderConfig();
        if (task.site.getCharset() != null && task.site.getCharset().trim().length() > 0)
          config.setCharset(task.site.getCharset());
        if (task.site.getUserAgent() != null && task.site.getUserAgent().trim().length() > 0)
          config.setUserAgentString(task.site.getUserAgent());
        if (task.site.getIncludeHttps() != null && task.site.getIncludeHttps().trim().length() > 0)
          config.setIncludeHttpsPages("1".equals(task.site.getIncludeHttps()) || "true".equals(task.site.getIncludeHttps()));
        String sdelay = task.site.getReqDelay();
        if (sdelay == null || sdelay.trim().length() == 0)
          sdelay = "200";
       
        int delay = CommonUtil.toSeconds(sdelay).intValue()*1000;
        if (delay < 0)
          delay = 200;
       
        config.setPolitenessDelay(delay);
        fetcher.setConfig(config);
       
        fetcher.init(site);
        site.fetcher = fetcher;
      }
     
      String url = task.url.replace(" ", "%20");
     
      FetchRequest req = new FetchRequest();
      req.setUrl(url);
     
      FetchResult fr = site.fetcher.fetch(req);
      return fr;
    }
//    return fetch();
  }
 
//  private FetchResult fetch(){
//    FetchResult fetchResult = new FetchResult();
//    CrawlerConfiguration config = new CrawlerConfiguration(task.url);
//   
//    listener.onInfo(Thread.currentThread(), "crawling url: " + task.url);
//
//    Url urlToCrawl = new Url(config.beginUrl(), 0);
//        Page page = config.downloader().get(urlToCrawl.link());
//        if (page.getStatusCode() != Status.OK) {
//          listener.onError(Thread.currentThread(), "errorUrl->" + urlToCrawl.link(), new Exception(page.getStatusCode().name() + " link->" + urlToCrawl.link()));
//        } else {
//          org.eweb4j.spiderman.fetcher.Page _page = new org.eweb4j.spiderman.fetcher.Page();
//      _page.setContent(page.getContent());
//      _page.setContentType("text/html");
//      _page.setContentData(page.getContent().getBytes());
//      _page.setCharset(page.getCharset());
//      _page.setUrl(page.getUrl());
//      fetchResult.setPage(_page);
//      fetchResult.setFetchedUrl(page.getUrl());
//      fetchResult.setStatusCode(page.getStatusCode().ordinal());
//        }
//
//        for (String l : page.getLinks()) {
//            String link = config.normalizer().normalize(l);
//            final Url url = new Url(link, urlToCrawl.depth() + 1);
//            //是否进入递归抓取,如果进入递归就需要控制深度
//        }
//       
//        return fetchResult;
//  }

}
TOP

Related Classes of spiderman.plugin.impl.FetchPointImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.