// Source code of com.ontometrics.scraper.ListingDetailScraperTest

package com.ontometrics.scraper;


import static com.ontometrics.scraper.HtmlSample.PagedListingFolder;
import static com.ontometrics.scraper.HtmlSample.PagedListingTable;
import static com.ontometrics.scraper.extraction.HtmlExtractor.html;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.is;


import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.ListIterator;


import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import com.ontometrics.scraper.extraction.DefaultFieldExtractor;
import com.ontometrics.scraper.extraction.HtmlExtractor;
import com.ontometrics.scraper.extraction.Link;
import com.ontometrics.scraper.extraction.LinkExtractor;


public class ListingDetailScraperTest {


  private static final Logger log = LoggerFactory.getLogger(ListingDetailScraperTest.class);


//  private static final String listingTableKeyword = "Opportunity Title";
//  private static final String linkPattern = "mode=VIEW";


  @Test
  public void canExtractLinksFromListingPage() {
    List<Link> foundLinks = new LinkExtractor().source(html().url(PagedListingTable.getUrl())).getLinks();


    log.info("found {} links: {}", foundLinks.size(), foundLinks);
    assertThat(foundLinks.size(), is(greaterThan(0)));
  }


  @Test
    @Ignore
  public void canExtractLinksFromMultiplePagesThenFollowToDetailsPage() throws MalformedURLException {
    Iterator pageIterator = new Iterator() {


      List<Link> foundLinks = new LinkExtractor().source(html().url(PagedListingTable.getUrl())).getLinks();
      ListIterator<Link> foundLinksIterator = foundLinks.listIterator();


      @Override
      public URL next() {
        Link currentLink = (Link) foundLinksIterator.next();
        log.debug("current iterating page = {}", currentLink.getHref());


        String CurrentURLStr = PagedListingFolder.getPath() + "/" + currentLink.getHref();
        URL currentURL = TestUtil.getFileAsURL(CurrentURLStr);


        return currentURL;
      }


      @Override
      public boolean hasNext() {


        try {
          Link nextLink = foundLinks.get(foundLinksIterator.nextIndex());
          log.debug("next page to iterate = {}", nextLink.getHref());
        } catch (Exception e) {
          log.debug("Page iterator can't peek next link");
        }


        return foundLinksIterator.hasNext();
      }


      @Override
      public URL getBaseUrl() {
        // TODO Auto-generated method stub
        return PagedListingFolder.getUrl();
      }
    };
    HtmlExtractor htmlExtractor = html().url(PagedListingTable.getUrl());


    List<Record> records = new ListingDetailScraper().setConvertURLs(true)
        .iterator(pageIterator)
        .listing(new LinkExtractor().source(htmlExtractor))
        .details(new DefaultFieldExtractor().source(htmlExtractor))
        .getRecords();


    assertThat(records.size(), greaterThan(0));


    log.debug("fields = {}", records);


  }


}
// Source code of com.ontometrics.scraper.ListingDetailScraperTest

// Related classes of com.ontometrics.scraper.ListingDetailScraperTest