// Source code of com.ontometrics.scraper.legacy.ScraperTest

package com.ontometrics.scraper.legacy;


import static com.ontometrics.scraper.HtmlSample.DetailPage;
import static com.ontometrics.scraper.HtmlSample.GrantsGovTable;
import static com.ontometrics.scraper.HtmlSample.PagedListingTable;
import static com.ontometrics.scraper.HtmlSample.ProgramDetailPage;
import static com.ontometrics.scraper.HtmlSample.ProgramListingPage;
import static com.ontometrics.scraper.HtmlSample.TableWithMultipleValuesOnMultipleRows;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasItems;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.notNullValue;


import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.MessageFormat;
import java.util.List;


import net.htmlparser.jericho.HTMLElementName;


import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import com.ontometrics.scraper.Iterator;
import com.ontometrics.scraper.Record;
import com.ontometrics.scraper.TestUtil;
import com.ontometrics.scraper.extraction.Field;
import com.ontometrics.scraper.util.ScraperUtil;


public class ScraperTest {


  private static final Logger log = LoggerFactory.getLogger(ScraperTest.class);


  private Scraper scraper;


  private String eligibilityCodeId = "dnf_class_values_cfda__applicant_eligibility__widget";


  private String eligibilityClassName = "fld_applicant_eligibility";


  @Before
  public void setup() {
    scraper = new Scraper();
  }


  @Test
  public void scrapeUrlReturnsHtml() throws IOException {
    String pageText = new Scraper().url(PagedListingTable.getUrl()).getResult();
    log.info("pageText = {}", pageText);
    assertThat(pageText.length(), is(greaterThan(0)));
    assertThat(pageText.toLowerCase().contains("<html>"), is(true));
  }


  @Test
  public void extractPageText() throws IOException {
    String pageContent = new Scraper().url(PagedListingTable.getUrl()).asText().getResult();
    assertThat(pageContent.contains("<html>"), is(false));
    log.info("Content: {}", pageContent);
  }


  @Test
  public void extractTableFromPage() throws Exception {
    log.info("HtmlElementName.TABLE: {}", HTMLElementName.TABLE);
    String pageText = scraper.url(ProgramListingPage.getUrl())
        .extract(scraper.extractor().table(2).execute())
        .getResult();
    log.debug("table extracted: {}", pageText);
    assertThat(pageText.startsWith("<table"), is(true));
    log.info(pageText);


  }


  @Test
  public void extractLinksFromTableOnPage() throws Exception {
    List<Field> urls = scraper.url(PagedListingTable.getUrl())
        .extract(scraper.extractor().setUseDefaultFieldExtractor(false).table(0).links().getFields())
        .getFields();


    log.info("links extracted: {}", urls);
    assertThat(urls.size(), is(greaterThan(0)));


  }


  @Test
  public void extractContentsOfElementWithId() throws Exception {
    String tagText = scraper.url(ProgramDetailPage.getUrl())
        .extract(scraper.extractor().id(eligibilityCodeId).execute()).getResult();
    log.info("tag text: {}", tagText);
    assertThat(tagText.contains("nonprofit institutions of higher education"), is(true));
  }


  @Test
  public void extractContentsByClassAndOccurrence() throws Exception {
    String tagText = scraper.url(ProgramDetailPage.getUrl())
        .extract(scraper.extractor().ofClass(eligibilityClassName, 1).execute()).getResult();
    log.info("tag text: {}", tagText);
    assertThat(tagText.contains("39"), is(true));
    assertThat(tagText.contains("52"), is(true));
    
    
    //This returns all occurrances not the first one only.
    tagText = scraper.url(ProgramDetailPage.getUrl())
        .extract(scraper.extractor().ofClass(eligibilityClassName, 0).execute()).getResult();
    log.info("tag text: {}", tagText);
    assertThat(tagText.contains("Applicant Eligibility (081)"), is(true));
    assertThat(tagText.contains("39"), is(true));
  }


  @Test
  public void extractParameterFromLinksInTable() throws Exception {
    Scraper scraper = new Scraper();
    List<String> ids = scraper.url(GrantsGovTable.getUrl())
        .extractStrings(scraper.extractor().table(3).links().parameter("oppId").getResults()).getResults();


    assertThat(ids.size(), is(greaterThan(0)));
    log.info("ids found: {}", ids);
    assertThat(ids, hasItems("40034", "40158", "40095", "40790", "40821", "40849", "40315"));
    assertThat(ids, hasItems("40967", "41255", "41282", "40458", "41599", "41734", "40667", "41771"));
    assertThat(ids, hasItems("41898", "41032", "41896", "42394", "42445"));
  }


  @Test
  public void extractLinksFromTableContainingString() throws Exception {
    Scraper scraper = new Scraper();
    String table = scraper.url(DetailPage.getUrl()).extract(scraper.extractor().table("Document Type").execute())
        .getResult();


    log.info("table matching {} : {}", "Document Type", table);
    assertThat(table.toString().contains("Document Type"), is(true));


  }


  @Test
  public void extractParameterFromLinksInIteratedTables() throws Exception {
    Scraper scraper = new Scraper();
    Iterator pageIterator = new Iterator() {
      private int currentPage = 2;


      @Override
      public URL getBaseUrl() {
        return null;
      }


      @Override
      public URL next() {
        String nextPageUrl = MessageFormat.format("/testpages/ids-page-{0}.html", currentPage++);
        log.debug("next page to iterate to: {}", nextPageUrl);
        return TestUtil.getFileAsURL(nextPageUrl);
      }


      @Override
      public boolean hasNext() {
        return true;
      }


    };
    List<String> ids = scraper.url(GrantsGovTable.getUrl()).pages(1).iterator(pageIterator)
        .extractStrings(scraper.extractor().table(3).links().parameter("oppId").getResults()).getResults();


    assertThat(ids.size(), is(40));
    log.info("ids {} found: {}", ids.size(), ids);
  }


  @Test
  public void extractFieldsFromTable() throws IOException {
    Scraper scraper = new Scraper();
    List<Field> opportunities = scraper.url(DetailPage.getUrl())
        .extract(scraper.extractor().field("title", HTMLElementName.H1).getFields()).getFields();


    assertThat(opportunities.size(), is(greaterThan(0)));
    log.debug("fields = {}", opportunities);


  }


  @Test
  public void extractFieldsFromTableAndTitleFromH1() throws IOException {
    Scraper scraper = new Scraper();
    List<Field> opportunities = scraper.url(DetailPage.getUrl())
        .extract(scraper.extractor().field("title", HTMLElementName.H1).getFields())
        .extract(scraper.extractor().table(4).getFields()).getFields();


    assertThat(opportunities.size(), is(greaterThan(0)));
    assertThat(opportunities.get(0), is(notNullValue()));
    log.debug("fields = {}", opportunities);


  }


  @Test
  public void extractFieldsAfterTablePairedTags() throws MalformedURLException, IOException {
    Scraper scraper = new Scraper();
    List<Field> fields = scraper
        .url(DetailPage.getUrl())
        .extract(
            scraper.extractor().after(HTMLElementName.TABLE, 5)
                .pair(HTMLElementName.H4, HTMLElementName.DD).getFields()).getFields();


    assertThat(fields.size(), is(greaterThan(0)));
    assertThat(ScraperUtil.getFieldValue(fields, "description").startsWith("The focus of this two-year program"),
        is(true));
    log.debug("fields = {}", fields);


  }


  @Test
  public void extractFieldsBasedOnPairedTags() throws MalformedURLException, IOException {
    Scraper scraper = new Scraper();
    List<Field> fields = scraper.url(DetailPage.getUrl())
        .extract(scraper.extractor().pair(HTMLElementName.H4, HTMLElementName.DD).getFields()).getFields();


    assertThat(fields.size(), is(greaterThan(0)));
    log.debug("fields = {}", fields);


  }


  @Test
  public void extractFieldWithMultipleValues() throws MalformedURLException, IOException {
    Scraper scraper = new Scraper();
    List<Field> fields = scraper.url(DetailPage.getUrl())
        .extract(scraper.extractor().pair(HTMLElementName.H4, HTMLElementName.DD).getFields()).getFields();


    assertThat(fields.size(), is(greaterThan(0)));


    String[] eligibilityCodes = ScraperUtil.getFieldValue(fields, "Eligible Applicants").split(";");
    assertThat(eligibilityCodes.length, is(greaterThan(1)));
    for (int i = 0; i < eligibilityCodes.length; i++) {
      log.debug("eligibility code: {}", eligibilityCodes[i]);
    }


    log.debug("eligibility codes: {}", eligibilityCodes);


    fields = scraper.url(TableWithMultipleValuesOnMultipleRows.getUrl()).extract(scraper.extractor().getFields())
        .getFields();


    log.info("fields from table with multiple values on rows: {}", fields);


    Field cfdaNumbers = null;
    for (Field field : fields) {
      if (field.getLabel().equals("CFDA Number(s)")) {
        cfdaNumbers = field;
      }
    }


    assertThat(fields.size(), is(1));
    assertThat(cfdaNumbers, is(notNullValue()));
    assertThat(cfdaNumbers.getValue().contains(";"), is(true));


  }


  @Test
  public void extractFieldFromLargeDetailPage() throws MalformedURLException {
    Scraper scraper = new Scraper();
    List<Field> fields = scraper.url(DetailPage.getUrl()).getFields();


    log.info("fields in detail page: {}", fields.size());
    assertThat(fields.size(), is(greaterThan(0)));


    fields = scraper.url(PagedListingTable.getUrl()).getFields();


    assertThat(fields.size(), is(0)); // for now don't support extracting
                      // fields from listing tables


  }


  @Test
  public void extractFieldsBasedOnPairedTagsAfterAnotherTag() throws MalformedURLException, IOException {
    Scraper scraper = new Scraper();
    List<Field> fields = scraper
        .url(DetailPage.getUrl())
        .extract(
            scraper.extractor().after(HTMLElementName.TABLE, 5)
                .pair(HTMLElementName.H4, HTMLElementName.DD).getFields()).getFields();


    assertThat(fields.size(), is(greaterThan(0)));
    log.debug("paired tags returned fields: {}", fields);
    assertThat(ScraperUtil.getFieldValue(fields, "Eligible Applicants"), is(notNullValue()));


    assertThat(fields.get(fields.size() - 1).getValue().contains("mailto"), is(true));


  }


  @Test
  @Ignore
  public void useIteratedListingAndDetailInterface() throws IOException {
    String listingTableKeyword = "Opportunity Title";
    String linkPattern = "mode=VIEW";
    Scraper scraper = new Scraper();
    Iterator pageIterator = new Iterator() {
      private int currentPage = 2;


      @Override
      public URL next() {
        String nextPageUrl = MessageFormat.format("/testpages/ids-page-{0}.html", currentPage++);
        log.debug("next page to iterate to: {}", nextPageUrl);
        return TestUtil.getFileAsURL(nextPageUrl);
      }


      @Override
      public boolean hasNext() {
        return true;
      }


      @Override
      public URL getBaseUrl() {
        return null;
      }
    };
    Scraper detailScraper = new Scraper();
    detailScraper.extractor().setUseDefaultFieldExtractor(false);
    List<Record> records = scraper
        .url(PagedListingTable.getUrl())
        .pages(2)
        .iterator(pageIterator)
        .listing(
            scraper.extractor().setUseDefaultFieldExtractor(false).table(listingTableKeyword).links()
                .matching(linkPattern).getFields()).detail(detailScraper).getRecords();


    assertThat(records.size(), is(greaterThan(0)));
    log.debug("fields = {}", records);


  }


  @Test
  public void extractBuyerAndOfficeInformation() throws MalformedURLException, IOException {
    Scraper scraper = new Scraper();
    List<Field> fields = scraper.url(ProgramDetailPage.getUrl())
        .extract(scraper.extractor().ofClass("agency-name").getFields()).getFields();


    String agency = ScraperUtil.getFieldValue(fields, "Agency");
    String office = ScraperUtil.getFieldValue(fields, "Office");


    log.info("agency: {}", agency);
    log.info("office: {}", office);


    assertThat(agency, is("Department of Agriculture"));
    assertThat(office, is("Agricultural Research Service"));


  }


  @Test
  public void extractContactInfoFromClass() throws MalformedURLException, IOException {
    String officeInfoID = "dnf_class_values_cfda__hq_office_info__widget";
    String contactInfoFromPage = "Kathleen S. Townson, 5601 Sunnyside Ave, MS-5110, Betsville, Maryland 20705 Email: kathleen.townson@ars.usda.gov Phone: (301) 504-1702";
    Scraper scraper = new Scraper();
    String contactInfo = scraper.url(ProgramDetailPage.getUrl())
        .extract(scraper.extractor().id(officeInfoID).execute()).getResult();


    log.info("contactinfo: {}", contactInfo);
    assertThat(contactInfo.contains(contactInfoFromPage), is(true));
  }


}
// Source code of com.ontometrics.scraper.legacy.ScraperTest

// Related classes of com.ontometrics.scraper.legacy.ScraperTest