Package net.htmlparser.jericho

Examples of net.htmlparser.jericho.Source


      log.error("Error getting source in session mode: {}", e);
    } catch (IOException e) {
      log.error("Error getting source in session mode: {}", e);
    }

    Source source = new Source(responseBody);
    return source;
  }
View Full Code Here


      }
    }
  }

  public static List<Link> extractLinks(String sourceToParse) {
    Source source = new Source(sourceToParse);
    source.fullSequentialParse();
    List<Link> links = new ArrayList<Link>();
    List<Element> as = source.getAllElements(HTMLElementName.A);
    for (Element linkElement : as) {
      links.add(new Link(linkElement.getTextExtractor().toString(), linkElement.getAttributeValue("href")));
    }
    return links;
  }
View Full Code Here

    return result;
  }

  public static String extractUsingIdentifier(String html, TagOccurrence tagOccurrence) {
    String result = null;
    Source source = new Source(html);
    source.fullSequentialParse();
    if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.ID) {
      log.debug("extracting tag by id: {}", tagOccurrence.getIdentifier());
      Element idElement = source.getElementById(tagOccurrence.getIdentifier());
      if (idElement != null) {
        result = idElement.toString();
      } else {
        result = "";
      }
    } else if (tagOccurrence.getElementIdentifierType() == ElementIdentifierType.cssClass) {
      log.debug("extracting: {}", tagOccurrence);
      List<Element> elements = source.getAllElementsByClass(tagOccurrence.getIdentifier());
      if(elements.size() > tagOccurrence.getOccurrence())
        result = elements.get(tagOccurrence.getOccurrence()).toString();
    }
    log.debug("identifier: {}/{} result: {}",
        new Object[] { tagOccurrence.getIdentifier(), tagOccurrence.getElementIdentifierType(), result });
View Full Code Here

  }

  public static String extractTagMatching(String html, TagOccurrence toGet) {
    log.debug("looking for {} in tags: {}", toGet.getMatching(), toGet.getTag());
    String found = null;
    Source source = new Source(html);
    source.fullSequentialParse();
    log.debug("source = {}", source);
    List<Element> elements = source.getAllElements(HTMLElementName.TABLE);
    for (Element element : elements) {
      log.debug("this element = {}", element);
      String elementText = element.getTextExtractor().toString();
      if (elementText.contains(toGet.getMatching())) {
        found = element.toString();
View Full Code Here

    return tag;
  }

  public static String extractSessionId(URL url, String sessionIDName) throws IOException {
    String sessionID = null;
    Source source = new Source(url);
    source.fullSequentialParse();
    List<Element> links = source.getAllElements(HTMLElementName.A);
    for (Element link : links) {
      // log.info("link: {}", link.toString());
      String href = link.getAttributeValue("href");
      if (href != null && href.contains(sessionIDName)) {
        sessionID = extractParameter(href, sessionIDName);
View Full Code Here

    if (result == null) {
      throw new IllegalStateException("Manipulator " + this.getClass().getName() + " returned null.");
    }
    if (type == OperationType.Manipulator) {
      log.debug("reassigning source..");
      Source newSource = new Source(result);
      newSource.fullSequentialParse();
      this.source = newSource;
    }
    if (successor != null) {
      successor.execute(this.source);
    }
View Full Code Here

    source.fullSequentialParse();
    return source.getAllElements(HTMLElementName.BR).size() > 1;
  }

  private String delimitFieldValues(String source) {
    Source result = new Source(source.replace("<br>", ";").replace("<br/>", ";"));
    return getValueFieldText(result.getFirstElement());
  }
View Full Code Here

  }

  private String extractTagMatching(String html, TagOccurrence toGet) {
    log.debug("looking for {} in tags: {}", toGet.getMatching(), toGet.getTag());
    String found = null;
    Source source = new Source(html);
    source.fullSequentialParse();
    List<Element> elements = source.getAllElements(HTMLElementName.TABLE);
    for (Element element : elements) {
      String elementText = element.getTextExtractor().toString();
      if (elementText.contains(toGet.getMatching())) {
        found = element.toString();
      }
View Full Code Here

   * @throws IOException
   */
  private List<Field> defaultFieldExtractions() throws IOException {
    List<Field> extractedFields = new ArrayList<Field>();

    Source source = null;
    try {
      source = new Source(this.url);
    } catch (FileNotFoundException e) {
      log.info("Error while sourcing URL = {}, error description = {}", this.url, e.toString());
      return new ArrayList<Field>();
    }
    source.fullSequentialParse();

    List<Element> tables = source.getAllElements(HTMLElementName.TABLE);

    for (Element table : tables) {
      extractedFields.addAll(extractFieldsFromTable(table.toString()));
    }

    List<Element> dls = source.getAllElements(HTMLElementName.DL);

    for (Element dt : dls) {
      extractedFields.addAll(extractFieldsFromDL(dt.toString()));
    }

View Full Code Here

  }

  private List<Field> extractFieldsFromTable(String html) {
    // log.debug("extracting fields from table: {}", html);
    List<Field> extractedFields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    List<Element> cells = source.getAllElements(HTMLElementName.TD);
    int rows = source.getAllElements(HTMLElementName.TR).size();
    log.debug("found {} cells in {} rows", cells.size(), rows);
    if (cells.size() == (rows * 2)) {
      Field lastField = null;
      log.debug("cells.size: {}", cells.size());
      for (int i = 0; i < cells.size(); i++) {
View Full Code Here

TOP

Related Classes of net.htmlparser.jericho.Source

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.