* of URLs to be fetched (in a thread-safe way). It checks
* whether the URL is OK to download. If so, the page is fetched.
*/
public void run() {
FetchListEntry fle = new FetchListEntry();
while (true) {
if (LogFormatter.hasLoggedSevere()) // something bad happened
break; // exit
String url = null;
try {
if (fetchList.next(fle) == null)
break;
url = fle.getPage().getURL().toString();
if (!fle.getFetch()) { // should we fetch this page?
if (LOG.isLoggable(Level.FINE))
LOG.fine("not fetching " + url);
handleFetch(fle, new ProtocolOutput(null, ProtocolStatus.STATUS_NOTFETCHING));
continue;
}
// support multiple redirects, if requested by protocol
// or content meta-tags (the latter requires running Fetcher
// in parsing mode). Protocol-level redirects take precedence over
// content-level redirects. Some plugins can handle redirects
// automatically, so that only the final success or failure will be
// reported here.
boolean refetch = false;
int redirCnt = 0;
do {
LOG.fine("redirCnt=" + redirCnt);
refetch = false;
LOG.info("fetching " + url); // fetch the page
Protocol protocol = ProtocolFactory.getProtocol(url);
ProtocolOutput output = protocol.getProtocolOutput(fle);
ProtocolStatus pstat = output.getStatus();
Content content = output.getContent();
switch(pstat.getCode()) {
case ProtocolStatus.SUCCESS:
if (content != null) {
synchronized (Fetcher.this) { // update status
pages++;
bytes += content.getContent().length;
if ((pages % 100) == 0) { // show status every 100pp
status();
}
}
ParseStatus ps = handleFetch(fle, output);
if (ps != null && ps.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newurl = ps.getMessage();
newurl = URLFilters.filter(newurl);
if (newurl != null && !newurl.equals(url)) {
refetch = true;
url = newurl;
redirCnt++;
fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
LOG.fine(" - content redirect to " + url);
} else {
LOG.fine(" - content redirect skipped, " +
(url.equals(newurl)? "newurl == url" : "prohibited by urlfilter"));
}
}
}
break;
case ProtocolStatus.MOVED: // try to redirect immediately
case ProtocolStatus.TEMP_MOVED: // try to redirect immediately
// record the redirect. perhaps the DB will want to know this.
handleFetch(fle, output);
String newurl = pstat.getMessage();
newurl = URLFilters.filter(newurl);
if (newurl != null && !newurl.equals(url)) {
refetch = true;
url = newurl;
redirCnt++;
// create new entry.
fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
LOG.info(" - protocol redirect to " + url);
} else {
LOG.fine(" - protocol redirect skipped, " +
(url.equals(newurl)? "newurl == url" : "prohibited by urlfilter"));
}