final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given
final String ip = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); // the ip from the connecting peer
int pos = 0;
int port = 0;
DigestURI url = null;
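// reconstruct the full request URL from the connection properties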
try {
url = new DigestURI(HeaderFramework.getRequestURL(conProp));
if (log.isFine()) log.logFine(reqID +" GET "+ url);
if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader);
//redirector
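// the redirector is an external process wired to this proxy via stdin/stdout:
// we write one URL per line and read back either a rewritten URL or an empty
// line, which means "keep the original URL"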
if (redirectorEnabled){
synchronized(redirectorProcess){
// keep the write and the read in one synchronized block, so that answers of
// concurrent requests cannot be interleaved and assigned to the wrong request
redirectorWriter.println(url.toNormalform(false, true));
redirectorWriter.flush();
final String newUrl = redirectorReader.readLine();
// readLine() returns null when the redirector process has died
if (newUrl != null && !newUrl.isEmpty()) {
try {
url = new DigestURI(newUrl);
} catch (final MalformedURLException e) {
// malformed answer from the redirector: just keep the old URL
}
}
}
if (log.isFinest()) log.logFinest(reqID +" using redirector to "+ url);
conProp.put(HeaderFramework.CONNECTION_PROP_HOST, url.getHost()+":"+url.getPort());
conProp.put(HeaderFramework.CONNECTION_PROP_PATH, url.getPath());
requestHeader.put(HeaderFramework.HOST, url.getHost()+":"+url.getPort());
requestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, url.getPath());
}
} catch (final MalformedURLException e) {
final String errorMsg = "ERROR: internal error with url generation: host=" +
host + ", port=" + port + ", path=" + path + ", args=" + args;
log.logSevere(errorMsg);
HTTPDemon.sendRespondError(conProp, countedRespond, 4, 501, null, errorMsg, e);
return;
}
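// split the host part of the request into host name and port; without an
// explicit port the HTTP default port 80 is used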
if ((pos = host.indexOf(':')) < 0) {
port = 80;
} else {
port = Integer.parseInt(host.substring(pos + 1));
host = host.substring(0, pos);
}
// check the blacklist
// blacklist idea inspired by [AS]:
// respond with a 403 for all AGIS ("all you get is shit") servers
final String hostlow = host.toLowerCase();
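// re-append the query string so that the blacklist match below also sees the arguments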
if (args != null) { path = path + "?" + args; }
if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_PROXY, hostlow, path)) {
log.logInfo("AGIS blocking of host '" + hostlow + "'");
HTTPDemon.sendRespondError(conProp, countedRespond, 4, 403, null,
"URL '" + hostlow + "' blocked by yacy proxy (blacklisted)", null);
return;
}
// handle outgoing cookies
handleOutgoingCookies(requestHeader, host, ip);
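// adjust the request header before forwarding; prepareRequestHeader presumably
// rewrites fields that must not be passed through to the target host as-is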
prepareRequestHeader(conProp, requestHeader, hostlow);
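// look up a previously cached response header for this URL; null means the page is not in the cache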
ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash());
// why are files unzipped upon arrival? why not zip all files in the cache?
// This follows from these premises:
// (a) no file shall be unzipped more than once, to avoid unnecessary computing time
// (b) old cache entries shall be comparable with refill entries to detect/distinguish cases 3 and 4 below
// (c) the indexing mechanism needs unzipped files; a scheduler could do that later
// premises (b) and (c) contradict each other if we use a scheduler, because files in a stale
// cache would be unzipped while newly arriving files would be zipped and would have to be
// unzipped again upon load; the scheduler would then be superfluous. Therefore the only
// remaining option is
// (d) cached files shall be either all zipped or all unzipped
// but case (d) contradicts (a), because files need to be unzipped for indexing. Therefore
// the only remaining choice is to unzip files right upon load. That is what we do here.
// finally use the existing cache entry if appropriate
// here we must decide whether or not to save the data to the cache.
// we distinguish four CACHE STATE cases:
// 1. cache fill
// 2. cache fresh - no refill
// 3. cache stale - refill necessary
// 4. cache stale - refill superfluous
// in two of these cases we trigger a scheduler to handle newly arrived files:
// cases 1 and 3
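// the branches below map these cases to the code paths:
// no cached header -> case 1: fulfill from web (and fill the cache),
// cached header and content, still fresh -> case 2: fulfill from cache,
// cached header but stale or content missing -> case 3/4: fulfill from web again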
if (cachedResponseHeader == null) {
if (log.isFinest()) log.logFinest(reqID + " page not in cache: fulfill request from web");
fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond);
} else {
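// rebuild a crawler Request/Response pair from the cached metadata so that the
// freshness rules of the proxy profile can be applied to the cache entry below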
final Request request = new Request(
null,
url,
requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(),
"",
cachedResponseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
0,
0,
0,
0);
final Response response = new Response(
request,
requestHeader,
cachedResponseHeader,
"200 OK",
sb.crawler.defaultProxyProfile
);
final byte[] cacheContent = Cache.getContent(url.hash());
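// serve from the cache only if the cached content is still available and the
// entry is still fresh according to the proxy freshness rules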
if (cacheContent != null && response.isFreshForProxy()) {
if (log.isFinest()) log.logFinest(reqID + " fulfill request from cache");
fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
} else {
if (log.isFinest()) log.logFinest(reqID + " fulfill request from web");