package org.archive.accesscontrol.robotstxt;
import java.io.IOException;
import java.net.ConnectException;
import java.net.NoRouteToHostException;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.ConnectTimeoutException;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;

import org.archive.accesscontrol.RobotsUnavailableException;
/**
* HttpRobotClient allows fetching of robots.txt rules over HTTP.
*
* @author aosborne
*
*/
public class HttpRobotClient extends RobotClient {
    private static final Logger LOGGER = Logger.getLogger(
            RobotClient.class.getName());

    /** HTTP status code for a successful response (resolves the old TODO). */
    private static final int HTTP_OK = 200;

    protected HttpClient http = new HttpClient(
            new MultiThreadedHttpConnectionManager());

    public HttpClient getHttpClient() {
        return http;
    }

    /**
     * Fetches and parses the robots.txt rules governing the given URL.
     *
     * @param url       URL whose site-level robots.txt should be fetched
     * @param userAgent value to send in the {@code User-Agent} request header
     * @return the parsed robot rules
     * @throws IOException if reading the response body fails unexpectedly
     * @throws RobotsUnavailableException if robots.txt could not be fetched:
     *         non-200 status, protocol error, unknown host, connect timeout,
     *         no route to host, or connection refused
     */
    public RobotRules getRulesForUrl(String url, String userAgent)
            throws IOException, RobotsUnavailableException {
        String robotsUrl = robotsUrlForUrl(url);
        HttpMethod method = new GetMethod(robotsUrl);
        method.addRequestHeader("User-Agent", userAgent);
        try {
            int code = http.executeMethod(method);
            if (code != HTTP_OK) {
                throw new RobotsUnavailableException(robotsUrl);
            }
            RobotRules rules = new RobotRules();
            // Parse consumes the body fully here, before the connection is
            // returned to the pool in the finally block below.
            rules.parse(method.getResponseBodyAsStream());
            return rules;
        } catch (HttpException e) {
            // Log through LOGGER with the cause attached rather than
            // printStackTrace(), so the failure reaches the application log.
            LOGGER.log(Level.INFO, "HTTP protocol error for URL " + robotsUrl, e);
            throw new RobotsUnavailableException(robotsUrl);
        } catch (UnknownHostException e) {
            LOGGER.info("Unknown host for URL " + robotsUrl);
            throw new RobotsUnavailableException(robotsUrl);
        } catch (ConnectTimeoutException e) {
            LOGGER.info("Connection Timeout for URL " + robotsUrl);
            throw new RobotsUnavailableException(robotsUrl);
        } catch (NoRouteToHostException e) {
            LOGGER.info("No route to host for URL " + robotsUrl);
            throw new RobotsUnavailableException(robotsUrl);
        } catch (ConnectException e) {
            LOGGER.info("ConnectException URL " + robotsUrl);
            throw new RobotsUnavailableException(robotsUrl);
        } finally {
            // Always return the connection to the pool. Without this, the
            // MultiThreadedHttpConnectionManager leaks one pooled connection
            // per call and eventually deadlocks at the per-host limit.
            method.releaseConnection();
        }
    }

    @Override
    public void prepare(Collection<String> urls, String userAgent) {
        // Prefetching is a deliberate no-op for the plain HTTP client.
    }

    @Override
    public void setRobotProxy(String host, int port) {
        http.getHostConfiguration().setProxy(host, port);
    }
}