Package info.bliki.htmlcleaner

Examples of info.bliki.htmlcleaner.TagNode


                        // ignore lists and tables since they most of the time
                        // do not hold grammatically correct
                        // interesting sentences that are representative of the
                        // language.
                    } else if (node instanceof TagNode) {
                        TagNode tagNode = (TagNode) node;
                        Map<String, String> attributes = tagNode.getAttributes();
                        Map<String, Object> oAttributes = tagNode.getObjectAttributes();
                        boolean hasSpecialHandling = false;
                        String tagName = tagNode.getName();
                        int tagBegin = countingBuffer.currentPosition;
                        if ("a".equals(tagName)) {
                            String href = attributes.get(HREF_ATTR_KEY);
                            if (href != null
                                    && INTERWIKI_PATTERN.matcher(href).matches()) {
                                // ignore the interwiki links since they are
                                // mostly used for translation purpose.
                                hasSpecialHandling = true;
                            }
                        } else if ("ref".equals(tagName)) {
                            // ignore the references since they do not hold
                            // interesting text content
                            hasSpecialHandling = true;
                        } else if (oAttributes != null
                                && oAttributes.get(WIKIOBJECT_ATTR_KEY) instanceof ImageFormat) {
                            // the caption of images often holds well formed
                            // sentences with links to entities
                            hasSpecialHandling = true;
                            ImageFormat iformat = (ImageFormat) oAttributes.get(WIKIOBJECT_ATTR_KEY);
                            imageNodeToText(tagNode, iformat, countingBuffer,
                                    model);
                        }
                        if (!hasSpecialHandling) {
                            nodesToText(tagNode.getChildren(), countingBuffer,
                                    model);
                        }
                        if (PARAGRAPH_TAGS.contains(tagName)) {
                            paragraphs.add(new Annotation(tagBegin,
                                    countingBuffer.currentPosition,
View Full Code Here


    StringBuilder resultBuffer = new StringBuilder(fInputHTML.length());
    try {
      cleaner = new HtmlCleaner(fInputHTML);
      cleaner.clean();
      // resultBuffer.append(cleaner.getXmlAsString());
      TagNode body = cleaner.getBodyNode();
      converter.nodeToWiki(body, resultBuffer);
    } catch (IOException e) {
    }
    return resultBuffer.toString();
  }
View Full Code Here

      String content = contentToken.getContent();
      // content = content.replaceAll("&nbsp;", " ");
      content = StringUtils.replace(content, "&nbsp;", " ");
      wikiText.append(content);
    } else if (node instanceof TagNode) {
      TagNode tagNode = (TagNode) node;

      String name = tagNode.getName();
      HTMLTag tag = (HTMLTag) fHashMap.get(name);

      if (tag != null) {
        boolean showWithoutTag = false;
        if (fNoDiv && name.equals("div")) {
          showWithoutTag = true;
        }
        if (fNoFont && name.equals("font")) {
          showWithoutTag = true;
        }
        tag.content(this, tagNode, wikiText, showWithoutTag);
      } else {
        List children = tagNode.getChildren();
        if (children.size() != 0) {
          nodesToText(children, wikiText);
        }
      }
    }
View Full Code Here

      ContentToken contentToken = (ContentToken) node;
      String content = contentToken.getContent();
      content = content.replaceAll("&nbsp;", " ");
      plainText.append(content);
    } else if (node instanceof TagNode) {
      TagNode tagNode = (TagNode) node;
      List children = tagNode.getChildren();
      if (children.size() != 0) {
        nodesToPlainText(children, plainText);
      }
    }
  }
View Full Code Here

      String content = contentToken.getContent();
      // content = content.replaceAll("&nbsp;", " ");
      content = StringUtils.replace(content, "&nbsp;", " ");
      wikiText.append(content);
    } else if (node instanceof TagNode) {
      TagNode tagNode = (TagNode) node;

      String name = tagNode.getName();
      HTMLTag tag = (HTMLTag) fHashMap.get(name);

      if (tag != null) {
        boolean showWithoutTag = false;
        if (fNoDiv && name.equals("div")) {
          showWithoutTag = true;
        }
        if (fNoFont && name.equals("font")) {
          showWithoutTag = true;
        }
        tag.content(this, tagNode, wikiText, showWithoutTag);
      } else {
        List children = tagNode.getChildren();
        if (children.size() != 0) {
          nodesToText(children, wikiText);
        }
      }
    }
View Full Code Here

      ContentToken contentToken = (ContentToken) node;
      String content = contentToken.getContent();
      content = content.replaceAll("&nbsp;", " ");
      plainText.append(content);
    } else if (node instanceof TagNode) {
      TagNode tagNode = (TagNode) node;
      List children = tagNode.getChildren();
      if (children.size() != 0) {
        nodesToPlainText(children, plainText);
      }
    }
  }
View Full Code Here

  public void append(BaseToken contentNode) {
    fTagStack.append(contentNode);
  }

  public void appendExternalImageLink(String imageSrc, String imageAltText) {
    TagNode spanTagNode = new TagNode("span");
    append(spanTagNode);
    spanTagNode.addAttribute("class", "image", true);
    TagNode imgTagNode = new TagNode("img");
    spanTagNode.addChild(imgTagNode);
    imgTagNode.addAttribute("src", imageSrc, true);
    imgTagNode.addAttribute("alt", imageAltText, true);
    // "nofollow" keyword is not allowed for XHTML
    // imgTagNode.addAttribute("rel", "nofollow", true);
  }
View Full Code Here

    // || ext.equalsIgnoreCase("bmp")) {
    // appendExternalImageLink(link, linkName);
    // return;
    // }
    // }
    TagNode aTagNode = new TagNode("a");
    aTagNode.addAttribute("href", link, true);
    aTagNode.addAttribute("class", "externallink", true);
    aTagNode.addAttribute("title", link, true);
    aTagNode.addAttribute("rel", "nofollow", true);
    if (withoutSquareBrackets) {
      append(aTagNode);
      aTagNode.addChild(new ContentToken(linkName));
    } else {
      String trimmedText = linkName.trim();
      if (trimmedText.length() > 0) {
        pushNode(aTagNode);
        WikipediaParser.parseRecursive(trimmedText, this, false, true);
View Full Code Here

  public void appendInternalImageLink(String hrefImageLink, String srcImageLink, ImageFormat imageFormat) {
    int pxWidth = imageFormat.getWidth();
    int pxHeight = imageFormat.getHeight();
    String caption = imageFormat.getCaption();
    TagNode divTagNode = new TagNode("div");
    divTagNode.addAttribute("id", "image", false);
    // String link = imageFormat.getLink();
    // if (link != null) {
    // String href = encodeTitleToUrl(link, true);
    // divTagNode.addAttribute("href", href, false);
    // } else {
    if (hrefImageLink.length() != 0) {
      divTagNode.addAttribute("href", hrefImageLink, false);
    }
    // }
    divTagNode.addAttribute("src", srcImageLink, false);
    divTagNode.addObjectAttribute("wikiobject", imageFormat);
    if (pxHeight != -1) {
      if (pxWidth != -1) {
        divTagNode.addAttribute("style", "height:" + pxHeight + "px; " + "width:" + pxWidth + "px", false);
      } else {
        divTagNode.addAttribute("style", "height:" + pxHeight + "px", false);
      }
    } else {
      if (pxWidth != -1) {
        divTagNode.addAttribute("style", "width:" + pxWidth + "px", false);
      }
    }
    pushNode(divTagNode);

    String imageType = imageFormat.getType();
    // TODO: test all these cases
    if (caption != null && caption.length() > 0
        && ("frame".equals(imageType) || "thumb".equals(imageType) || "thumbnail".equals(imageType))) {

      TagNode captionTagNode = new TagNode("div");
      String clazzValue = "caption";
      String type = imageFormat.getType();
      if (type != null) {
        clazzValue = type + clazzValue;
      }
      captionTagNode.addAttribute("class", clazzValue, false);
      //     
      TagStack localStack = WikipediaParser.parseRecursive(caption, this, true, true);
      captionTagNode.addChildren(localStack.getNodeList());
      String altAttribute = imageFormat.getAlt();
      if (altAttribute == null) {
        altAttribute = captionTagNode.getBodyString();
        imageFormat.setAlt(altAttribute);
      }
      pushNode(captionTagNode);
      // WikipediaParser.parseRecursive(caption, this);
      popNode();
View Full Code Here

    if (replaceColon()) {
      encodedtopic = encodedtopic.replace(':', '/');
    }
    hrefLink = hrefLink.replace("${title}", encodedtopic);

    TagNode aTagNode = new TagNode("a");
    // append(aTagNode);
    aTagNode.addAttribute("href", hrefLink, true);
    // aTagNode.addChild(new ContentToken(linkText));
    pushNode(aTagNode);
    WikipediaParser.parseRecursive(linkText.trim(), this, false, true);
    popNode();
  }
View Full Code Here

TOP

Related Classes of info.bliki.htmlcleaner.TagNode

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.