Package cn.edu.hfut.dmic.webcollector.crawler

Source Code of cn.edu.hfut.dmic.webcollector.crawler.CommonCrawler

/*
* Copyright (C) 2014 hu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/

package cn.edu.hfut.dmic.webcollector.crawler;


import cn.edu.hfut.dmic.webcollector.fetcher.Fetcher;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.Request;
import cn.edu.hfut.dmic.webcollector.parser.HtmlParser;
import cn.edu.hfut.dmic.webcollector.parser.Parser;
import cn.edu.hfut.dmic.webcollector.util.CommonConnectionConfig;
import cn.edu.hfut.dmic.webcollector.util.Config;
import cn.edu.hfut.dmic.webcollector.util.ConnectionConfig;
import java.net.Proxy;
import java.net.URL;

/**
* 一种常用的广度遍历爬虫
* @author hu
*/
public abstract class CommonCrawler extends Crawler{
    private String cookie = null;
    private String useragent = "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:26.0) Gecko/20100101 Firefox/26.0";

    private boolean isContentStored = true;
    private Proxy proxy = null;
    private ConnectionConfig conconfig = null;
   
    /**
     * 根据url生成Request(http请求)的方法,可以通过Override这个方法来自定义Request
     * @param url
     * @return 实现Request接口的对象
     * @throws Exception
     */
    @Override
    public Request createRequest(String url) throws Exception {
        HttpRequest request = new HttpRequest();
        URL _URL = new URL(url);
        request.setURL(_URL);
        request.setProxy(proxy);
        request.setConnectionConfig(conconfig);
        return request;
    }

    /**
     * 根据网页的url和contentType,来创建Parser(解析器),可以通过Override这个方法来自定义Parser
     * @param url
     * @param contentType
     * @return 实现Parser接口的对象
     * @throws Exception
     */
    @Override
    public Parser createParser(String url, String contentType) throws Exception {
        if (contentType == null) {
            return null;
        }
        if (contentType.contains("text/html")) {
            return new HtmlParser(Config.topN);
        }
        return null;
    }
   
   
    @Override
    public Fetcher createFetcher() {
        Fetcher fetcher = new Fetcher();
        fetcher.setNeedUpdateDb(true);
        fetcher.setIsContentStored(isContentStored);
        conconfig = new CommonConnectionConfig(useragent, cookie);
        fetcher.setThreads(getThreads());
        return fetcher;
    }
   
    /**
     * 返回User-Agent
     * @return User-Agent
     */
    public String getUseragent() {
        return useragent;
    }

    /**
     * 设置User-Agent
     * @param useragent
     */
    public void setUseragent(String useragent) {
        this.useragent = useragent;
    }
   
    /**
     * 返回http连接配置对象
     *
     * @return http连接配置对象
     */
    public ConnectionConfig getConconfig() {
        return conconfig;
    }

    /**
     * 设置http连接配置对象
     *
     * @param conconfig http连接配置对象
     */
    public void setConconfig(ConnectionConfig conconfig) {
        this.conconfig = conconfig;
    }
   
    /**
     * 返回是否存储网页/文件的内容
     * @return 是否存储网页/文件的内容
     */
    public boolean getIsContentStored() {
        return isContentStored;
    }

    /**
     * 设置是否存储网页/文件的内容
     * @param isContentStored 是否存储网页/文件的内容
     */
    public void setIsContentStored(boolean isContentStored) {
        this.isContentStored = isContentStored;
    }
   
     /**
     * 返回代理
     * @return 代理
     */
    public Proxy getProxy() {
        return proxy;
    }

    /**
     * 设置代理
     * @param proxy 代理
     */
    public void setProxy(Proxy proxy) {
        this.proxy = proxy;
    }
   
    /**
     * 返回Cookie
     * @return Cookie
     */
    public String getCookie() {
        return cookie;
    }

    /**
     * 设置http请求的cookie
     * @param cookie Cookie
     */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }
}
TOP

Related Classes of cn.edu.hfut.dmic.webcollector.crawler.CommonCrawler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.