Package com.ontometrics.scraper.extraction

Source Code of com.ontometrics.scraper.extraction.TextExtractorTest

package com.ontometrics.scraper.extraction;

import static com.ontometrics.scraper.HtmlSample.ProgramDetailPage;
import static com.ontometrics.scraper.extraction.HtmlExtractor.html;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertThat;

import net.htmlparser.jericho.Source;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TextExtractorTest {

  private static final Logger log = LoggerFactory.getLogger(TextExtractorTest.class);

  private String eligibilityClassName = "fld_applicant_eligibility";

  @Test
  public void getTextFromElement() {
    String text = new TextExtractor().source(
        html().url(ProgramDetailPage.getUrl()).ofClass(eligibilityClassName, 1)).getText();

    log.info("text from class: {} is: {}", eligibilityClassName, text);

  }

    @Test
    public void canGetSourceThenChainWithHtmlExtractor(){
        Source source = html().url(ProgramDetailPage.getUrl()).getSource();

        log.info("source: {}", source.toString());
        assertThat(source, notNullValue());

        String text = new TextExtractor().source(html().source(source).ofClass(eligibilityClassName, 1)).getText();

        log.info("text extracted from prefetched source: {}", text);

        assertThat(text, is("Applicant Eligibilty: 39-Anyone/general public/ 52-Libraries/lnformation/Statistics"));

    }

  @Test
  public void canGetAttributeOfElement() {
    String className = "lst-pp";
    Object attributeName = "onchange";
    String text = new TextExtractor()
        .source(html()
            .url(ProgramDetailPage.getUrl())
            .ofClass(eligibilityClassName, 1)
            .attribute(attributeName))
        .getText();

    log.info("attribute {} from class: {} is: {}", new Object[] { className, attributeName, text });

  }

}
TOP

Related Classes of com.ontometrics.scraper.extraction.TextExtractorTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.