Package org.archive.accesscontrol.robotstxt

Source Code of org.archive.accesscontrol.robotstxt.HttpRobotClient

package org.archive.accesscontrol.robotstxt;

import java.io.IOException;
import java.net.ConnectException;
import java.net.NoRouteToHostException;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.logging.Logger;

import org.apache.commons.httpclient.ConnectTimeoutException;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.archive.accesscontrol.RobotsUnavailableException;

/**
* HttpRobotClient fetches robots.txt rules over HTTP using Apache Commons HttpClient.
*
* @author aosborne
*
*/
public class HttpRobotClient extends RobotClient {
    private static final Logger LOGGER = Logger.getLogger(
            RobotClient.class.getName());
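    // A single HttpClient backed by a multi-threaded connection manager, so
    // one instance of this class can be used from several threads at once.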
    protected HttpClient http = new HttpClient(
            new MultiThreadedHttpConnectionManager());
   
    public HttpClient getHttpClient() {
        return http;
    }

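    /**
     * Fetch and parse the robots.txt rules that govern the given URL.
     *
     * @param url       the URL whose site's robots.txt should be consulted
     * @param userAgent the value to send in the User-Agent request header
     * @return the parsed rules from the site's robots.txt
     * @throws RobotsUnavailableException if robots.txt could not be fetched
     * @throws IOException if the response could not be read
     */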
    public RobotRules getRulesForUrl(String url, String userAgent) throws IOException, RobotsUnavailableException {
        String robotsUrl = robotsUrlForUrl(url);
        HttpMethod method = new GetMethod(robotsUrl);
        method.addRequestHeader("User-Agent", userAgent);
        try {
            int code = http.executeMethod(method);
            // Treat any status other than HTTP 200 OK as "robots.txt unavailable".
            if (code != HttpStatus.SC_OK) {
                throw new RobotsUnavailableException(robotsUrl);
            }
        } catch (HttpException e) {
            LOGGER.info("HttpException for URL " + robotsUrl + ": " + e);
            throw new RobotsUnavailableException(robotsUrl);
        } catch (UnknownHostException e) {
            LOGGER.info("Unknown host for URL " + robotsUrl);
            throw new RobotsUnavailableException(robotsUrl);
        } catch (ConnectTimeoutException e) {
            LOGGER.info("Connection Timeout for URL " + robotsUrl);
            throw new RobotsUnavailableException(robotsUrl);
        } catch (NoRouteToHostException e) {
            LOGGER.info("No route to host for URL " + robotsUrl);
            throw new RobotsUnavailableException(robotsUrl);
        } catch (ConnectException e) {
            LOGGER.info("ConnectException URL " + robotsUrl);
            throw new RobotsUnavailableException(robotsUrl);
        }
        RobotRules rules = new RobotRules();
        try {
            rules.parse(method.getResponseBodyAsStream());
        } finally {
            // Return the connection to the pool once the body has been read.
            method.releaseConnection();
        }
        return rules;
    }

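    /**
     * No preparation is needed: rules are fetched over HTTP on demand.
     */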
    @Override
    public void prepare(Collection<String> urls, String userAgent) {
        // no-op
    }

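    /**
     * Route robots.txt requests through the given HTTP proxy.
     */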
    @Override
    public void setRobotProxy(String host, int port) {
        http.getHostConfiguration().setProxy(host, port);
    }
}
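
The sketch below shows one way HttpRobotClient might be driven; it is not part of the original source. The URL, user-agent string, and proxy host/port are placeholder values, and the example only prints the fetched RobotRules object rather than assuming any particular query method on it.

import java.io.IOException;

import org.archive.accesscontrol.RobotsUnavailableException;
import org.archive.accesscontrol.robotstxt.HttpRobotClient;
import org.archive.accesscontrol.robotstxt.RobotRules;

public class HttpRobotClientExample {
    public static void main(String[] args) {
        HttpRobotClient client = new HttpRobotClient();

        // Optional: send all robots.txt requests through a proxy
        // (placeholder host and port).
        // client.setRobotProxy("proxy.example.org", 3128);

        try {
            RobotRules rules = client.getRulesForUrl(
                    "http://example.org/some/page.html", "ExampleBot/1.0");
            System.out.println("Fetched rules: " + rules);
        } catch (RobotsUnavailableException e) {
            System.out.println("robots.txt unavailable: " + e.getMessage());
        } catch (IOException e) {
            System.out.println("I/O error fetching robots.txt: " + e.getMessage());
        }
    }
}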