package net.sf.jpluck.spider;
import net.sf.jpluck.ClientConfiguration;
import net.sf.jpluck.http.HttpClient;
import net.sf.jpluck.http.HttpResponse;
import net.sf.jpluck.http.TimeoutException;
import java.net.URI;
import java.util.logging.Level;
/**
* HTTP retriever that uses the java.net classes.
*/
class HttpRetriever extends Retriever {
    /** Referrer handed to the {@link HttpClient} before a network request is made. */
    private final String referrer;

    /**
     * Creates a retriever for a single URI.
     *
     * @param spider   forwarded unchanged to the {@link Retriever} constructor
     * @param uri      forwarded unchanged to the {@link Retriever} constructor; the URI fetched by {@link #run()}
     * @param level    forwarded unchanged to the {@link Retriever} constructor and later to the {@link Resource}
     * @param embedded forwarded unchanged to the {@link Retriever} constructor and later to the {@link Resource}
     * @param referrer value passed to {@link HttpClient#setReferrer} when the URI is fetched over the network
     */
    HttpRetriever(Spider spider, String uri, int level, boolean embedded, String referrer) {
        super(spider, uri, level, embedded);
        this.referrer = referrer;
    }

    /**
     * Retrieves the URI (from the HTTP cache when offline, otherwise through a conditional GET)
     * and hands a 200 response to the spider for parsing.
     *
     * <p>If the spider is not running on entry the method returns immediately; in that case no
     * events are fired and {@code notifyTaskDone()} is not called. Otherwise
     * {@code fireRetrievalCompleted} and {@code notifyTaskDone} are always invoked on exit,
     * whether the retrieval succeeded, was skipped or failed. Failures are logged, never rethrown.
     */
    public void run() {
        if (!spider.isRunning()) {
            return;
        }
        try {
            spider.fireRetrievalStarted(uri);

            HttpResponse cached = spider.retrieveFromCache(uri);
            // Read the offline flag once so that every decision below sees the same value.
            boolean offline = Spider.isOffline();
            if (offline && cached == null) {
                logger.warning("Could not retrieve " + uri + " from HTTP cache.");
                return;
            }

            HttpResponse response;
            if (offline) {
                // Offline: the cached copy is the only source (known to be non-null here).
                response = cached;
                logger.info(response.getStatusCode() + " " + response.getStatusMessage() + ": " + uri);
            } else {
                // The client is only needed for network access, so it is created here, inside
                // the try block: a failure while setting it up still reaches the finally clause.
                // NOTE(review): httpTimeout * 1000 looks like a seconds-to-milliseconds conversion - confirm.
                HttpClient httpClient = new HttpClient(ClientConfiguration.getDefault().getDownloadAttempts(),
                        spider.httpTimeout * 1000, spider.jxlDocument.getUserAgent(),
                        ClientConfiguration.getDefault().getAcceptCharset());
                httpClient.setReferrer(referrer);

                // 0 is passed when there is no cached copy to validate against.
                long ifModifiedSince = (cached != null) ? cached.getDate() : 0;
                response = httpClient.doGet(uri, ifModifiedSince, spider.cookieStore);
                logger.info(response.getStatusCode() + " " + response.getStatusMessage() + ": " + uri);
                if (response.getStatusCode() == 304) {
                    if (cached != null) {
                        // Not modified: continue with the cached copy.
                        response = cached;
                    } else {
                        // A 304 without a cached copy leaves nothing to parse; keep the 304
                        // response so the status check below skips processing.
                        logger.warning("Received 304 for " + uri + " but no cached copy is available.");
                    }
                }
            }

            if (!spider.isRunning()) {
                return;
            }
            if (response.getStatusCode() != 200) {
                return;
            }

            // Only a freshly downloaded response is stored; offline responses are always the
            // cached instance, so the identity check covers the offline case as well.
            if (response != cached && spider.jxlDocument.isUseHTTPCache()) {
                spider.storeInCache(uri, response);
            }

            String redirectionURI = null;
            if (response.isRedirected()) {
                redirectionURI = response.getRedirectionURL();
                logger.fine(uri + ": redirected to " + redirectionURI);
                // NOTE(review): this compares getUri() with a String; verify getUri() returns a String.
                if (spider.jxlDocument.getUri().equals(uri)) {
                    // Correct the starting URI
                    logger.fine("Resetting starting URI to " + redirectionURI);
                    spider.jxlDocument.setStartingURI(URI.create(redirectionURI));
                }
            }

            boolean autoDetectEncoding = spider.jxlDocument.isAutoDetectInputEncoding();
            ContentType contentType = new ContentType(response.getContentType());
            if (!autoDetectEncoding) {
                // Auto-detection is off: derive the content type from the document's configured input encoding.
                contentType = contentType.derive(spider.jxlDocument.getInputEncoding());
            }
            Resource resource = new Resource(uri, redirectionURI, contentType, response.getContent(), level,
                    embedded);
            if (autoDetectEncoding) {
                resource.scanForContentType();
            }
            spider.parse(resource);
        } catch (TimeoutException e) {
            logger.warning("Timed out: " + uri);
        } catch (Exception e) {
            // Report through the logger (with the URI and the stack trace) instead of stderr.
            logger.log(Level.WARNING, "Could not retrieve " + uri, e);
        } finally {
            spider.fireRetrievalCompleted(uri);
            spider.notifyTaskDone();
        }
    }
}