Package net.htmlparser.jericho

Examples of net.htmlparser.jericho.Source


    for (String field : fields) {
      String stringToRender = (String) doc.getContentField(field);
      if (stringToRender == null || stringToRender.isEmpty()) {
        continue;
      } else {
        Source source = new Source(stringToRender);
        String renderedString = source.getRenderer().toString();
        doc.putContentField(field, renderedString);
      }
    }
  }
View Full Code Here


    return formatted;

  }

  public String changeTagCase(String contents, boolean uppercase) {
    Source source = new Source(contents);
    source.fullSequentialParse();
    OutputDocument outputDocument = new OutputDocument(source);
    List<Tag> tags = source.getAllTags();
    int pos = 0;
    for (Tag tag : tags) {
      Element tagElement = tag.getElement();
      if (tagElement == null) {
        System.out.println(tag.getName());
View Full Code Here

    assertThat(result, is("Teton Science School"));
  }

  @Test
  public void extractLinksWithText() throws IOException {
    Source source = new Source(TableOfLinks.getUrl());
    source.fullSequentialParse();
    List<Link> links = ScraperUtil.extractLinks(source.toString());
    log.info("found following links in table: {}", links);
  }
View Full Code Here

  @Test
  public void canParseLIWithStrong() {
    String li = "<li><strong> Minimum Term&nbsp;&nbsp;&nbsp;</strong> &nbsp;</li>";

    Source source = new Source(li);
    source.fullSequentialParse();

    String[] parsedOnClosingTag = source.toString().split("</");

    log.info("split on close tag: {} and {}", parsedOnClosingTag[0], parsedOnClosingTag[1]);
    Element liElement = source.getAllElements(HTMLElementName.LI).get(0);
    log.info("li: {}", liElement);
    log.info("li tags: {}", liElement.getAllTags());
    Field field = extractFieldByDetectingTagWrapper(liElement);
    log.info("found field: {}", field);
View Full Code Here

    }

    @Test
    public void canPassParsedSourceToExtractor(){
        Source source = html().url(CareerBuilderDetailPage.getUrl()).getSource();

        List<Field> fields = new DefaultFieldExtractor()
                .source(source)
                .field("jobTitle", ElementIdentifierType.cssClass, "job_title")
                .field("jobDescription", ElementIdentifierType.cssClass, "job_desc")
View Full Code Here

    return found;
  }

    @Test
    public void canGetTagAttributeFields(){
        Source source = html().url(CareerBuilderDetailPage.getUrl()).getSource();

        List<Field> fields = new DefaultFieldExtractor()
                .source(source)
                .add(new DefaultTagAttributeFieldExtractor("meta", "name", "content"))
                .getFields();
View Full Code Here

  }

  @Test
  public void canExtractValueOfAnAttribute() {
    attributeExtractor.execute(new Source("<span onchange=\"" + targetValue  + "\">"));
    assertThat(mockExtractionAgent.getResult().toString(), is(targetValue));
  }
View Full Code Here

  }

    @Test
    public void canGetSourceThenChainWithHtmlExtractor(){
        Source source = html().url(ProgramDetailPage.getUrl()).getSource();

        log.info("source: {}", source.toString());
        assertThat(source, notNullValue());

        String text = new TextExtractor().source(html().source(source).ofClass(eligibilityClassName, 1)).getText();

        log.info("text extracted from prefetched source: {}", text);
View Full Code Here

  }

  @Test
  public void canSpliceSource() {

    Source pageSource = GrantsGovTable.getSource();
    assertThat(pageSource, is(notNullValue()));
    splicingExtractor.execute(pageSource);
    Source result = mockExtractionAgent.getResult();

    log.info("result after splice: {}", result);

    assertThat(result.length(), is(greaterThan(0)));
    assertThat(result.toString().startsWith("<table"), is(true));

  }
View Full Code Here

        .build());

    beginExtractor.setSuccessor(endExtractor);
    endExtractor.setSuccessor(mockExtractionAgent);

    Source pageSource = GrantsGovTable.getSource();
    assertThat(pageSource, is(notNullValue()));
    beginExtractor.execute(pageSource);
    Source result = mockExtractionAgent.getResult();

    log.info("result after splice between: {}", result);

    assertThat(result.length(), is(greaterThan(0)));
    // assertThat(result.toString().startsWith("<table"), is(true));

  }
View Full Code Here

TOP

Related Classes of net.htmlparser.jericho.Source

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.