try {
// TODO KKr - when fetching the last item, send a Connection: close
// header to let the server know it doesn't need to keep the socket open.
Iterator<ScoredUrlDatum> iter = _items.iterator();
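// Fetch each queued item in turn, bailing out of the loop if this thread has been interrupted.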
while (!Thread.interrupted() && iter.hasNext()) {
ScoredUrlDatum item = iter.next();
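// Create a placeholder result up front, so we still have a datum to emit in the finally block if the fetch throws.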
FetchedDatum result = new FetchedDatum(item);
// We use status as an extra field on the end of the FetchedDatum that lets
// us generate a full status pipe, and also a content pipe that only has
// entries which were fetched. By keying off the type (string == OK,
// BaseFetchException == bad) the FetchPipe can do this magic.
Comparable status = null;
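// Record when this fetch started, both for the fetch-time counter and for enforcing the minimum per-page fetch interval below.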
long fetchStartTime = System.currentTimeMillis();
try {
process.increment(FetchCounters.URLS_FETCHING, 1);
result = _httpFetcher.get(item);
long deltaTime = System.currentTimeMillis() - fetchStartTime;
process.increment(FetchCounters.FETCHED_TIME, (int)deltaTime);
process.increment(FetchCounters.URLS_FETCHED, 1);
process.increment(FetchCounters.FETCHED_BYTES, result.getContentLength());
process.setStatus(Level.SLF4J_TRACE, "Fetched " + result);
status = UrlStatus.FETCHED.toString();
// TODO - check the keep-alive response header (if present), and close the connection or delay
// for some amount of time if we would exceed the server's keep-alive limit.
} catch (AbortedFetchException e) {
LOGGER.info("Aborted while fetching " + item.getUrl() + " due to " + e.getAbortReason());
if (e.getAbortReason() == AbortedFetchReason.INTERRUPTED) {
process.increment(FetchCounters.URLS_SKIPPED, 1);
// Make sure our loop terminates.
Thread.currentThread().interrupt();
} else {
process.increment(FetchCounters.URLS_FAILED, 1);
}
status = (Comparable)e;
} catch (BaseFetchException e) {
LOGGER.info("Fetch exception while fetching " + item.getUrl(), e);
process.increment(FetchCounters.URLS_FAILED, 1);
// We can do this because each of the concrete subclasses of BaseFetchException implements
// WritableComparable
status = (Comparable)e;
} catch (Exception e) {
LOGGER.warn("Unexpected exception while fetching " + item.getUrl(), e);
process.increment(FetchCounters.URLS_FAILED, 1);
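// Wrap the unexpected exception in an IOFetchException so that, like the other fetch exceptions, it can be used as the status field.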
status = new IOFetchException(item.getUrl(), new IOException(e));
} finally {
process.decrement(FetchCounters.URLS_FETCHING, 1);
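// Always emit a tuple for this URL, whether the fetch succeeded or failed, so every entry in the fetch set is accounted for downstream.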
Tuple tuple = result.getTuple();
tuple.add(status);
_fetchMgr.collect(tuple);
// Figure out how long it's been since the start of the request.
long fetchInterval = System.currentTimeMillis() - fetchStartTime;
// We want to avoid fetching faster than a max acceptable rate. Note that we always do
// this, even if there's not another page, so that this setting will have impact even
// if the next fetch set is ready right away.
if (fetchInterval < minPageFetchInterval) {
long delay = minPageFetchInterval - fetchInterval;
LOGGER.trace(String.format("FetchTask: sleeping for %dms", delay));
try {
Thread.sleep(delay);
} catch (InterruptedException e) {
LOGGER.warn("FetchTask interrupted!");
Thread.currentThread().interrupt();
continue;
}
}
}
}
// If we were interrupted, any remaining entries were never fetched; write them out with an aborted status so they aren't lost.
while (iter.hasNext()) {
ScoredUrlDatum item = iter.next();
FetchedDatum result = new FetchedDatum(item);
process.increment(FetchCounters.URLS_SKIPPED, 1);
AbortedFetchException status = new AbortedFetchException(item.getUrl(), AbortedFetchReason.INTERRUPTED);
Tuple tuple = result.getTuple();
tuple.add(status);
_fetchMgr.collect(tuple);
}