protected void diveIn(final ScrapeContext context, final Page firstPage, final String rootUrl, final String prefix,
final Set<String> entries)
throws IOException
{
Page page = firstPage;
boolean truncated;
do {
// check for truncation (isTruncated elem, this means we need to "page" the bucket to get all entries)
truncated = isTruncated(page);
// cancellation: bail out promptly if this task has been interrupted
CancelableUtil.checkInterruption();
// response should be 200 OK, if not, give up
if (page.getHttpResponse().getStatusLine().getStatusCode() != 200) {
context.stop("Remote recognized as " + getTargetedServer()
+ ", but cannot be scraped (unexpected response status " + page.getHttpResponse().getStatusLine() + ")");
return;
}
final Elements root = page.getDocument().getElementsByTag("ListBucketResult");
if (root.size() != 1 || !root.get(0).attr("xmlns").equals("http://s3.amazonaws.com/doc/2006-03-01/")) {
context.stop("Remote recognized as " + getTargetedServer()
+ ", but unexpected response was received (not \"ListBucketResult\").");
return;
}
log.debug("Processing S3 page response from remote of {} got from URL {}", context.getProxyRepository(), page.getUrl());
String markerElement = null;
final Elements elements = page.getDocument().getElementsByTag("Contents");
for (Element element : elements) {
final Elements keyElements = element.getElementsByTag("Key");
if (keyElements.isEmpty()) {
continue; // skip it
}