Package org.htmlparser.util

Examples of org.htmlparser.util.NodeList


    StringBuffer text = new StringBuffer();
    Parser parser = new Parser();
    parser.setInputHTML(html);
    parser.setEncoding(Globals.ENC_8859_1);
    // Extract all nodes that match the filter
    NodeList nodes;
    try {
      nodes = parser.extractAllNodesThatMatch(nfilter);
    } catch (ParserException e) {
      return html;
    }
    for(int i=0;i<nodes.size();i++){
      TextNode node = (TextNode)nodes.elementAt(i);
      text.append(node.getText());
    }
    return StringUtils.remove(text.toString(),"&nbsp;");
  }
View Full Code Here


    Parser parser = new Parser();
    StringBuffer prvContent = new StringBuffer();
    try {
      parser.setEncoding(Globals.ENC_8859_1);
      parser.setInputHTML(html);
      NodeList nodes = parser.extractAllNodesThatMatch(nfilter);
      Node node = null;
      for(int i=0;i<nodes.size();i++){
        if(prvContent.length() >= max_count){
          if(node instanceof TagNode){
            TagNode tmp_node = (TagNode)node;
            boolean isEnd = tmp_node.isEndTag();
            if(!isEnd){
              prvContent.setLength(prvContent.length()-tmp_node.getText().length()-2);
            }
          }
          //��������δ�رյı�ǩ
          Node parent = node;
          //System.out.println("current node is . "+parent.getText());
          do{
            parent = parent.getParent()
            //System.out.println("parent = "+parent);         
            if(parent==null) break;
            if(!(parent instanceof TagNode)) continue;
            //System.out.println("Parent node is no ended. "+parent.getText());
            prvContent.append(((TagNode)parent).getEndTag().toHtml());
          }while(true);
          break;
        }
        node = nodes.elementAt(i);
        if(node instanceof TagNode){
          TagNode tag = (TagNode)node;
          prvContent.append('<');
          prvContent.append(tag.getText());
          prvContent.append('>');
View Full Code Here

    StringBuffer text = new StringBuffer();
    Parser parser = new Parser();
    parser.setInputHTML(new String(inputHtml.getBytes(),ISO8859_1));
    //Parser parser = Parser.createParser(new String(inputHtml.getBytes(),ISO8859_1));
    // Extract all nodes that match the filter
    NodeList nodes = parser.extractAllNodesThatMatch(nodeFilter);
    for(int i=0;i<nodes.size();i++){
      Node node = nodes.elementAt(i);
      text.append(new String(node.toPlainTextString().getBytes(ISO8859_1)));
    }
    return text.toString();
  }
View Full Code Here

      StringBuffer text = new StringBuffer();
      Parser parser = new Parser();
      parser.setEncoding("8859_1");
      parser.setInputHTML(html);
      // Extract all nodes that match the filter
      NodeList nodes = parser.extractAllNodesThatMatch(scriptFilter);     
      for(int i=0;i<nodes.size();i++){
        Node node = nodes.elementAt(i);
        if(node instanceof TextNode)
          text.append(node.getText());
        else{
          text.append('<');
          text.append(node.getText());
View Full Code Here

      oTextHtmlAlt.addBodyPart(oMsgPlainText);

      // *****************************************
      // Iterate images from HTML and replace CIDs

      NodeList oCollectionList = new NodeList();
      TagNameFilter oImgFilter = new TagNameFilter ("IMG");
      for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
        e.nextNode().collectInto(oCollectionList, oImgFilter);

      final int nImgs = oCollectionList.size();

      if (DebugFile.trace) DebugFile.writeln("NodeList.size() = " + String.valueOf(nImgs));

      for (int i=0; i<nImgs; i++) {

        sSrc = ((ImageTag) oCollectionList.elementAt(i)).extractImageLocn();

        // Keep a reference to every related image name so that the same image is not included twice in the message
        if (!oDocumentImages.containsKey(sSrc)) {

          // Find last slash from image url
View Full Code Here

   
    int iSlash;
    Parser oPrsr;
    String sCid, sSrc;
    String sBodyCid = sBody;
    NodeList oCollectionList;
    TagNameFilter oImgFilter;

  // **********************************************************************
  // Replace <IMG SRC="..." >

    oPrsr = Parser.createParser(sBodyCid, sEnc);
   
    oCollectionList = new NodeList();
    oImgFilter = new TagNameFilter ("IMG");
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oImgFilter);

    int nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("Images NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {
    ImageTag oImgTag = (ImageTag) oCollectionList.elementAt(i);
     
        sSrc = oImgTag.extractImageLocn().replace('\\','/');
   
    if (DebugFile.trace) DebugFile.writeln("Processing image location "+sSrc);
   
        // Keep a reference to every related image name so that the same image is not included twice in the message
        if (!oImgs.containsKey(sSrc)) {

          // Find last slash from image url
          iSlash = sSrc.lastIndexOf('/');
     
          // Take image name
          if (iSlash>=0) {
            while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
              sCid = sSrc.substring(iSlash);
          }
          else {
            sCid = sSrc;
          }
          if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

          oImgs.put(sSrc, sCid);
        } // fi (!oImgs.containsKey(sSrc))
       
        sBodyCid = doSubstitution (sBodyCid, "Src", Gadgets.replace(Gadgets.replace(oImgTag.extractImageLocn(),'\\',"\\\\"),'.',"\\x2E"), sPreffix+oImgs.get(sSrc));
    } // next

  // **********************************************************************
  // Replace <TABLE BACKGROUND="..." >
   
    oCollectionList = new NodeList();
    TagNameFilter oTableFilter = new TagNameFilter("TABLE");
    oPrsr = Parser.createParser(sBodyCid, sEnc);
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oTableFilter);
         
    nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("Tables NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {

      sSrc = ((TableTag) oCollectionList.elementAt(i)).getAttribute("background");
      if (sSrc!=null) {
        if (sSrc.length()>0) {
          sSrc = sSrc.replace('\\','/');

      if (DebugFile.trace) DebugFile.writeln("Processing background location "+sSrc);

          // Keep a reference to every related image name so that the same image is not included twice in the message
          if (!oImgs.containsKey(sSrc)) {

            // Find last slash from image url
            iSlash = sSrc.lastIndexOf('/');

            // Take image name
            if (iSlash>=0) {
              while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
                sCid = sSrc.substring(iSlash);
            } // fi
            else {
              sCid = sSrc;
            }

            if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

            oImgs.put(sSrc, sCid);
          } // fi (!oImgs.containsKey(sSrc))
     
      sBodyCid = doSubstitution (sBodyCid, "Background", Gadgets.replace(Gadgets.replace(((TableTag) oCollectionList.elementAt(i)).getAttribute("background"),'\\',"\\\\"),'.',"\\x2E"), sPreffix+oImgs.get(sSrc));

        } // fi
      } // fi
    } // next

  // **********************************************************************
  // Replace <TD BACKGROUND="..." >
   
    oCollectionList = new NodeList();
    TagNameFilter oTDFilter = new TagNameFilter("TD");
    oPrsr = Parser.createParser(sBodyCid, sEnc);
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oTDFilter);
         
    nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("TD NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {

      sSrc = ((TableColumn) oCollectionList.elementAt(i)).getAttribute("background");
      if (sSrc!=null) {
        if (sSrc.length()>0) {
          sSrc = sSrc.replace('\\','/');

      if (DebugFile.trace) DebugFile.writeln("Processing td bg location "+sSrc);

          // Keep a reference to every related image name so that the same image is not included twice in the message
          if (!oImgs.containsKey(sSrc)) {

            // Find last slash from image url
            iSlash = sSrc.lastIndexOf('/');

            // Take image name
            if (iSlash>=0) {
              while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
                sCid = sSrc.substring(iSlash);
            } // fi
            else {
              sCid = sSrc;
            }

            if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

            oImgs.put(sSrc, sCid);
          } // fi (!oImgs.containsKey(sSrc))
     
      sBodyCid = doSubstitution(sBodyCid, "Background", Gadgets.replace(Gadgets.replace(((TableColumn) oCollectionList.elementAt(i)).getAttribute("background"),'\\',"\\\\"),'.',"\\x2E"), sPreffix+oImgs.get(sSrc));
        } // fi
      } // fi
    } // next

    if (DebugFile.trace) {
View Full Code Here

    int iSlash;
    Parser oPrsr;
    String sCid, sSrc;
    String sBodyCid = sBody;
    NodeList oCollectionList;
    TagNameFilter oImgFilter;

  // **********************************************************************
  // Replace <IMG SRC="..." >

    oPrsr = Parser.createParser(sBodyCid, sEnc);
   
    oCollectionList = new NodeList();
    oImgFilter = new TagNameFilter ("IMG");
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oImgFilter);

    int nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("Images NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {

        sSrc = (((ImageTag) oCollectionList.elementAt(i)).extractImageLocn()).replace('\\','/');

        // Keep a reference to every related image name so that the same image is not included twice in the message
        if (!oImgs.containsKey(sSrc)) {

          // Find last slash from image url
          iSlash = sSrc.lastIndexOf('/');
     
          // Take image name
          if (iSlash>=0) {
            while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
              sCid = sSrc.substring(iSlash);
          }
          else {
            sCid = sSrc;
          }

          // String sUid = Gadgets.generateUUID();
          // sCid = sUid.substring(0,12)+"$"+sUid.substring(12,20)+"$"+sUid.substring(20,28)+"@hipergate.org";

          if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

          oImgs.put(sSrc, sCid);
        } // fi (!oImgs.containsKey(sSrc))
       
        String sImgSrc = ((ImageTag) oCollectionList.elementAt(i)).extractImageLocn();
        if (sImgSrc.startsWith(sPreffix)) {
          sBodyCid = doSubstitution(sBodyCid, "Src", Gadgets.replace(Gadgets.replace(sImgSrc,'\\',"\\\\"),'.',"\\x2E"), sImgSrc.substring(sPreffix.length()));
        }
       
    } // next

  // **********************************************************************
  // Replace <TABLE BACKGROUND="..." >
   
    oCollectionList = new NodeList();
    TagNameFilter oTableFilter = new TagNameFilter("TABLE");
    oPrsr = Parser.createParser(sBodyCid, sEnc);
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oTableFilter);
         
    nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("Tables NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {

      sSrc = ((TableTag) oCollectionList.elementAt(i)).getAttribute("background");
      if (sSrc!=null) {
        if (sSrc.length()>0) {
          sSrc = sSrc.replace('\\','/');

          // Keep a reference to every related image name so that the same image is not included twice in the message
          if (!oImgs.containsKey(sSrc)) {

            // Find last slash from image url
            iSlash = sSrc.lastIndexOf('/');

            // Take image name
            if (iSlash>=0) {
              while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
                sCid = sSrc.substring(iSlash);
            } // fi
            else {
              sCid = sSrc;
            }

            if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

            oImgs.put(sSrc, sCid);
          } // fi (!oImgs.containsKey(sSrc))

          String sBckGrnd = ((TableTag) oCollectionList.elementAt(i)).getAttribute("background");
          if (sBckGrnd.startsWith(sPreffix)) {
            sBodyCid = doSubstitution(sBodyCid, "Background", Gadgets.replace(Gadgets.replace(sBckGrnd,'\\',"\\\\"),'.',"\\x2E"), sBckGrnd.substring(sPreffix.length()));
          }         
         
        } // fi
      } // fi
    } // next

  // **********************************************************************
  // Replace <TD BACKGROUND="..." >
   
    oCollectionList = new NodeList();
    TagNameFilter oTDFilter = new TagNameFilter("TD");
    oPrsr = Parser.createParser(sBodyCid, sEnc);
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oTDFilter);
         
    nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("TD NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {

      sSrc = ((TableColumn) oCollectionList.elementAt(i)).getAttribute("background");
      if (sSrc!=null) {
        if (sSrc.length()>0) {
          sSrc = sSrc.replace('\\','/');

          // Keep a reference to every related image name so that the same image is not included twice in the message
          if (!oImgs.containsKey(sSrc)) {

            // Find last slash from image url
            iSlash = sSrc.lastIndexOf('/');

            // Take image name
            if (iSlash>=0) {
              while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
                sCid = sSrc.substring(iSlash);
            } // fi
            else {
              sCid = sSrc;
            }

            if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

            oImgs.put(sSrc, sCid);
          } // fi (!oImgs.containsKey(sSrc))

          String sTdBckg = ((TableColumn) oCollectionList.elementAt(i)).getAttribute("background");
          if (sTdBckg.startsWith(sPreffix)) {
            sBodyCid = doSubstitution(sBodyCid, "Background", Gadgets.replace(Gadgets.replace(sTdBckg,'\\',"\\\\"),'.',"\\x2E"), sTdBckg.substring(sPreffix.length()));
          }

        } // fi
View Full Code Here

            }
        }
    };

    Parser parser = Parser.createParser(sBody, sEnc);
    NodeList list = parser.parse(null);
    list.visitAllNodesWith(linkVisitor);

    if (DebugFile.trace) {
      DebugFile.decIdent();
      DebugFile.writeln("End HtmlMimeBodyPart.addClickThroughRedirector()");
    }

    return list.toHtml();
  } // addClickThroughRedirector
View Full Code Here

    if(htmlCode == null){
      throw new NullPointerException("Input HTML code string is NULL");
    }   
    try{
      Parser parser = Parser.createParser(htmlCode, "UTF-8");
      final NodeList nl = parser.parse(null);
      nl.visitAllNodesWith(new NodeVisitor(){
        @SuppressWarnings("unchecked")
        public void visitTag(Tag tag){
          if((tag instanceof ScriptTag) ||
             (tag instanceof FrameTag)){
            if(tag.getParent() == null){
              nl.remove(tag);
            }else{
              tag.getParent().getChildren().remove(tag);
            }
          }
          if(tag instanceof HeadingTag){
            if("H1".equalsIgnoreCase(tag.getTagName())||
               "H2".equalsIgnoreCase(tag.getTagName())){
              tag.setTagName("H3");
              tag.getEndTag().setTagName("/H3");
            }
          }
          List<String> attrs2remove = new ArrayList<String>();
          Vector<Attribute> attrs = tag.getAttributesEx();
          for(Attribute attr : attrs){         
            if(attr.getLength() > 2){
              String prefix2Char = attr.getName().substring(0,2);
              if(prefix2Char.equalsIgnoreCase("on")){
                attrs2remove.add(attr.getName());
              }
            }
          }
          for(String a2r : attrs2remove){
            tag.removeAttribute(a2r);
          }
        }
      });     
      return nl.toHtml();
    }catch(Exception e){
      System.out.println(e);
    }
    return "";
  }
View Full Code Here

        for (int a=0; a<nAttrs; a++) {
          Attribute oAttr = (Attribute) oAttrs.get(a);
          oAttr.toString(oBuffer);
        }
        oBuffer.append(">");
        NodeList oChilds = oNode.getChildren();
        if (oChilds!=null) {
          for (NodeIterator i = oNode.getChildren().elements(); i.hasMoreNodes(); ) {
            Node oChildNode = i.nextNode();
            oBuffer.append(parseNode(oChildNode, oCompiler, oMatcher));
          } // next
View Full Code Here

TOP

Related Classes of org.htmlparser.util.NodeList

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.