Package net.htmlparser.jericho

Examples of net.htmlparser.jericho.Tag


        // print and set the title
        LOG.info("KML Element: " + kmlElementNameClean);
        kmlField.setName(kmlElementName);
        kmlField.setNameClean(kmlElementNameClean);

        Tag currentTag = htmlElement.getStartTag().getNextTag().getNextTag();
        while (!currentTag.getName().equals("h2")) {
          if (currentTag.getName().equals("h3")) {
            // here all the magic happens!
            checkForPossibleSubElements(currentTag, lookForTheseTags, kmlField, kmlElements);
          }

          /*
           * Each element is represented by two tags (start- and end-tag). And because there is no method like nextElement, we have to skip
           * two tags, to get to the next element.
           */
          if ((currentTag.getNextTag() == null) || (currentTag.getNextTag().getNextTag() == null)) {
            LOG.info("-> break");
            break;
          }
          currentTag = currentTag.getNextTag().getNextTag();
        }

        // put current element into the map of kmlElements (but only if a description is set)
        if (kmlField.getDescription().size() == 0) {
          continue;
View Full Code Here


   * @return
   */
  private static Tag checkForPossibleSubElements(final Tag ownerTag, final ArrayList<String> lookForTheseTags, final KMLReferenceField kmlField,
      final HashMap<String, KMLReferenceField> kmlElements) {

    Tag currentTag = ownerTag.getElement().getEndTag().getNextTag();
    final String h3Header = ownerTag.getElement().getTextExtractor().toString().trim().toLowerCase();

    // as long as the current tag is not an h3-tag, check it for element-properties
    while (!currentTag.getName().equals("h3")) {

      for (final String foundpossibleSubTag : lookForTheseTags) {
        // if a possible sub-tag is found and is not annotated as "back-to-top" then examine the element-kind
        if (currentTag.getName().equals(foundpossibleSubTag) && !"backtotop".equals(currentTag.getElement().getAttributeValue("class"))) {

          final String textFromCurrentElement = currentTag.getElement().getTextExtractor().toString().trim();
          // if there is no text in the current element -> skip to the next tag
          if (textFromCurrentElement.length() == 0) {
            continue;
          }

          /*
           * <h3>Syntax</h3>-Element found? The syntax-element usually contains one <pre>-tag containing the basic syntax of the
           * kml-element
           */
          if (h3Header.startsWith("syntax")) {
            // LOG.info("Syntax:     \t" + textFromCurrentElement);
            for (final Element elementPre : currentTag.getChildElements()) {
              kmlField.addToSyntax(elementPre.toString());
            }
          }

          /*
           * <h3>Description</h3>-Element found? The description-element contains one or more <p>-tags, which describe the usage of the
           * kml-element
           */
          else if (h3Header.startsWith("description") || foundpossibleSubTag.equals("h4")) {
            LOG.info("Description: \t" + textFromCurrentElement);
            kmlField.addToDescription(textFromCurrentElement);
          }

          /*
           * <h3>Extends</h3>-Element found? if only one element is listed it is encapsulated in a <p>-tag and if it extends more elements
           * it's encapsulated in an <ul>-tag as unordered list
           */
          else if (h3Header.startsWith("extends")) {
            if (currentTag.getName().equals("p")) {
              kmlField.addToExtend(textFromCurrentElement);
              LOG.info("Extends:     \t" + textFromCurrentElement);
            } else if (currentTag.getName().equals("ul")) {
              for (final Element elementUL : currentTag.getElement().getChildElements()) {

                for (final Element elementLI : elementUL.getChildElements()) {
                  kmlField.addToExtend(elementLI.getTextExtractor().toString());
                  LOG.info("Extends:     \t" + elementLI.getTextExtractor().toString());
                }
              }
            } else {
              // LOG.info("Extends EL:     \t----------- damn");
            }
          }

          /*
           * <h3>Extended By</h3>-Element found? if only one element is listed it is encapsulated in a <p>-tag and if it extends more
           * elements it's encapsulated in an <ul>-tag as unordered list
           */
          else if (h3Header.startsWith("extended by")) {
            if (currentTag.getName().equals("p")) {
              kmlField.addToExtendedBy(textFromCurrentElement);
              LOG.info("Extended By: \t" + textFromCurrentElement);
            } else if (currentTag.getName().equals("ul")) {
              for (final Element elementUL : currentTag.getElement().getChildElements()) {
                // text in ul we want is encapsulated in <a>- or
                // in <strong>-tag
                LOG.info("Extended By: \t" + elementUL.getStartTag().getNextTag().getElement().getTextExtractor().toString());
                kmlField.addToExtendedBy(elementUL.getStartTag().getNextTag().getElement().getTextExtractor().toString());
              }
            } else {
              // LOG.info("Extended By EL: \t----------- damn ");
            }
          }

          /*
           * <h3>Elements Specific to</h3>-Element found? the complex case! "elements specific to" could define new elements or explain
           * elements which are only used as fields for existing kml-fields
           */
          else if (h3Header.startsWith("elements specific to")) {
            if ((currentTag.getName().equals("p") || currentTag.getName().equals("dl") || currentTag.getName().equals("dt") || currentTag
                .getName().equals("ul"))) {
              final Tag endTag = currentTag.getElement().getEndTag();

              KMLReferenceField foundSpecificToField = null;
              while (currentTag.getEnd() < endTag.getEnd()) {
                // abort condition !h3 needed by <IconStyle> because of wrong nested h3-element (<dl><h3></h3>...)
                if (currentTag.getName().equals("h3")) {
                  LOG.info("!! abort condition !h3 needed by <IconStyle> because of wrong nested h3-element (<dl><h3></h3>...)");
                  break;
                }
View Full Code Here

        }

        superNode.getChildren().add(new HtmlBlockNode(text.substring(pos, markdownElement.getBegin())));

        String indent = getIndent(text, markdownElement);
        Tag endTag = getEndTag(text, source, markdownElement, indent);
        String innerText = getInnerText(text, markdownElement, endTag, indent);
        List<Node> children = parseInnerText(innerText);

        superNode.getChildren().add(new MarkdownInsideHtmlBlockNode(
            markdownElement.getStartTag().toString(),
            children,
            endTag.toString()
        ));

        return createMarkdownInsideHtmlBlockNode(text, endTag.getEnd(), source, superNode);
    }
View Full Code Here

    }

  private Field extractFieldByDetectingTagWrapper(Element liElement) {
    Field found = null;
    if (liElement.getAllTags().size() == 4) {
      Tag enclosingTag = liElement.getAllTags().get(1);
      log.info("enclosing tag: {}", enclosingTag);
      log.info("first element of enclosing tag: {}", enclosingTag.getElement().getTextExtractor().toString());
      String tagText = enclosingTag.getElement().getTextExtractor().toString();
      String allText = liElement.getTextExtractor().toString();
      log.info("enclosing tag text starts at: {}", allText.indexOf(tagText));
      log.debug("tagText: {} alltext: {}", tagText, allText);
      if (allText.startsWith(tagText)) {
        found = new ScrapedField(tagText, allText.substring(tagText.length() + 1));
View Full Code Here

      if (parts.length == 2) {
        Field field = new ScrapedField(parts[0], parts[1]);
        extractedFields.add(field);
        log.debug("found <li> to process: {}, added field: {}", li, field);
      } else if (tagsWithSpecificTagRemoved(HTMLElementName.BR, li.getAllTags()).size() == 4) {
        Tag enclosingTag = li.getAllTags().get(1);
        log.debug("enclosing tag: {}", enclosingTag);
        log
            .debug("first element of enclosing tag: {}", enclosingTag
                .getElement()
                .getTextExtractor()
                .toString());
        String tagText = enclosingTag
            .getElement()
            .getRenderer()
            .setMaxLineLength(Integer.MAX_VALUE)
            .toString()
            .trim()
View Full Code Here

  }

  private List<Tag> tagsWithSpecificTagRemoved(String tagNameToRemove, List<Tag> tags) {
    log.debug("Tag name to remove = {}, tags to operate on = {}", tagNameToRemove, tags);
    for (int i = 0; i < tags.size(); i++) {
      Tag currentTag = tags.get(i);
      log.debug("Current tag name = {}", currentTag.getName());
      if (currentTag.getName() == tagNameToRemove) {
        tags.remove(currentTag);
        i--;
      }
    }
    log.debug("returning tags = {}", tags);
View Full Code Here

TOP

Related Classes of net.htmlparser.jericho.Tag

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.