@Test
@Ignore
public void canExtractLinksFromMultiplePagesThenFollowToDetailsPage() throws MalformedURLException {
// Page iterator handed to the scraper: walks the links extracted from the paged
// listing table and turns each link's href into a URL via TestUtil.getFileAsURL.
Iterator pageIterator = new Iterator() {
    // The links are extracted once, when this anonymous iterator is instantiated.
    private final List<Link> foundLinks =
            new LinkExtractor().source(html().url(PagedListingTable.getUrl())).getLinks();
    private final ListIterator<Link> foundLinksIterator = foundLinks.listIterator();

    /**
     * Advances to the next extracted link and converts it to a URL.
     *
     * @return the URL built from {@code PagedListingFolder.getPath()}, a "/" separator
     *         and the link's href
     */
    @Override
    public URL next() {
        Link currentLink = foundLinksIterator.next();
        log.debug("current iterating page = {}", currentLink.getHref());
        String currentUrlStr = PagedListingFolder.getPath() + "/" + currentLink.getHref();
        return TestUtil.getFileAsURL(currentUrlStr);
    }

    /**
     * Reports whether another link remains and logs, for debugging, which link
     * would be visited next. The iterator position is not changed.
     *
     * @return {@code true} if {@link #next()} can be called again
     */
    @Override
    public boolean hasNext() {
        boolean more = foundLinksIterator.hasNext();
        // Peek only when an element exists: nextIndex() equals the list size at the
        // end, so an unguarded get() would throw instead of signalling "no more links".
        Link nextLink = more ? foundLinks.get(foundLinksIterator.nextIndex()) : null;
        if (nextLink != null) {
            log.debug("next page to iterate = {}", nextLink.getHref());
        } else {
            log.debug("Page iterator can't peek next link");
        }
        return more;
    }

    /**
     * @return the URL reported by {@code PagedListingFolder.getUrl()}
     */
    @Override
    public URL getBaseUrl() {
        return PagedListingFolder.getUrl();
    }
};
HtmlExtractor htmlExtractor = html().url(PagedListingTable.getUrl());
List<Record> records = new ListingDetailScraper().setConvertURLs(true)
.iterator(pageIterator)
.listing(new LinkExtractor().source(htmlExtractor))
.details(new DefaultFieldExtractor().source(htmlExtractor))
.getRecords();
assertThat(records.size(), greaterThan(0));