Package org.htmlparser

Examples of org.htmlparser.Parser


    HashMap oDocumentImages = new HashMap(23);

    StringSubstitution oSrcSubs = new StringSubstitution();

    Parser oPrsr = Parser.createParser(sHtmlBody, sEncoding);

    String sCid, sSrc;

    try {

      if (sTextBody==null) {
          // ****************************
          // Extract plain text from HTML
          if (DebugFile.trace) DebugFile.writeln("new StringBean()");

          StringBean oStrBn = new StringBean();

          try {
            oPrsr.visitAllNodesWith (oStrBn);
          } catch (ParserException pe) {
          throw new MessagingException(pe.getMessage(), pe);
          }

          sTextBody = oStrBn.getStrings();

          oStrBn = null;
      } // fi (sTextBody==null)

      // *******************************
      // Set plain text alternative part

      oMsgPlainText.setDisposition("inline");
      oMsgPlainText.setText(sTextBody, sCharEnc, "plain");
      // oMsgPlainText.setContent(sTextBody, "text/plain; charset="+sCharEnc);
      if (DebugFile.trace) DebugFile.writeln("MimeBodyPart(multipart/alternative).addBodyPart(text/plain)");
      oTextHtmlAlt.addBodyPart(oMsgPlainText);

      // *****************************************
      // Iterate images from HTML and replace CIDs

      NodeList oCollectionList = new NodeList();
      TagNameFilter oImgFilter = new TagNameFilter ("IMG");
      for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
        e.nextNode().collectInto(oCollectionList, oImgFilter);

      final int nImgs = oCollectionList.size();

      if (DebugFile.trace) DebugFile.writeln("NodeList.size() = " + String.valueOf(nImgs));
View Full Code Here


      DebugFile.writeln("Begin HtmlMimeBodyPart.addPreffixToImgSrc("+sPreffix+")");
      DebugFile.incIdent();
    }
   
    int iSlash;
    Parser oPrsr;
    String sCid, sSrc;
    String sBodyCid = sBody;
    NodeList oCollectionList;
    TagNameFilter oImgFilter;

  // **********************************************************************
  // Replace <IMG SRC="..." >

    oPrsr = Parser.createParser(sBodyCid, sEnc);
   
    oCollectionList = new NodeList();
    oImgFilter = new TagNameFilter ("IMG");
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oImgFilter);

    int nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("Images NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {
    ImageTag oImgTag = (ImageTag) oCollectionList.elementAt(i);
     
        sSrc = oImgTag.extractImageLocn().replace('\\','/');
   
    if (DebugFile.trace) DebugFile.writeln("Processing image location "+sSrc);
   
        // Keep a reference to every related image name so that the same image is not included twice in the message
        if (!oImgs.containsKey(sSrc)) {

          // Find last slash from image url
          iSlash = sSrc.lastIndexOf('/');
     
          // Take image name
          if (iSlash>=0) {
            while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
              sCid = sSrc.substring(iSlash);
          }
          else {
            sCid = sSrc;
          }
          if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

          oImgs.put(sSrc, sCid);
        } // fi (!oImgs.containsKey(sSrc))
       
        sBodyCid = doSubstitution (sBodyCid, "Src", Gadgets.replace(Gadgets.replace(oImgTag.extractImageLocn(),'\\',"\\\\"),'.',"\\x2E"), sPreffix+oImgs.get(sSrc));
    } // next

  // **********************************************************************
  // Replace <TABLE BACKGROUND="..." >
   
    oCollectionList = new NodeList();
    TagNameFilter oTableFilter = new TagNameFilter("TABLE");
    oPrsr = Parser.createParser(sBodyCid, sEnc);
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oTableFilter);
         
    nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("Tables NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {

      sSrc = ((TableTag) oCollectionList.elementAt(i)).getAttribute("background");
      if (sSrc!=null) {
        if (sSrc.length()>0) {
          sSrc = sSrc.replace('\\','/');

      if (DebugFile.trace) DebugFile.writeln("Processing background location "+sSrc);

          // Keep a reference to every related image name so that the same image is not included twice in the message
          if (!oImgs.containsKey(sSrc)) {

            // Find last slash from image url
            iSlash = sSrc.lastIndexOf('/');

            // Take image name
            if (iSlash>=0) {
              while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
                sCid = sSrc.substring(iSlash);
            } // fi
            else {
              sCid = sSrc;
            }

            if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

            oImgs.put(sSrc, sCid);
          } // fi (!oImgs.containsKey(sSrc))
     
      sBodyCid = doSubstitution (sBodyCid, "Background", Gadgets.replace(Gadgets.replace(((TableTag) oCollectionList.elementAt(i)).getAttribute("background"),'\\',"\\\\"),'.',"\\x2E"), sPreffix+oImgs.get(sSrc));

        } // fi
      } // fi
    } // next

  // **********************************************************************
  // Replace <TD BACKGROUND="..." >
   
    oCollectionList = new NodeList();
    TagNameFilter oTDFilter = new TagNameFilter("TD");
    oPrsr = Parser.createParser(sBodyCid, sEnc);
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oTDFilter);
         
    nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("TD NodeList.size() = " + String.valueOf(nImgs));
View Full Code Here

      DebugFile.writeln("Begin HtmlMimeBodyPart.removePreffixFromImgSrcs("+sPreffix+")");
      DebugFile.incIdent();
    }

    int iSlash;
    Parser oPrsr;
    String sCid, sSrc;
    String sBodyCid = sBody;
    NodeList oCollectionList;
    TagNameFilter oImgFilter;

  // **********************************************************************
  // Replace <IMG SRC="..." >

    oPrsr = Parser.createParser(sBodyCid, sEnc);
   
    oCollectionList = new NodeList();
    oImgFilter = new TagNameFilter ("IMG");
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oImgFilter);

    int nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("Images NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {

        sSrc = (((ImageTag) oCollectionList.elementAt(i)).extractImageLocn()).replace('\\','/');

        // Keep a reference to every related image name so that the same image is not included twice in the message
        if (!oImgs.containsKey(sSrc)) {

          // Find last slash from image url
          iSlash = sSrc.lastIndexOf('/');
     
          // Take image name
          if (iSlash>=0) {
            while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
              sCid = sSrc.substring(iSlash);
          }
          else {
            sCid = sSrc;
          }

          // String sUid = Gadgets.generateUUID();
          // sCid = sUid.substring(0,12)+"$"+sUid.substring(12,20)+"$"+sUid.substring(20,28)+"@hipergate.org";

          if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

          oImgs.put(sSrc, sCid);
        } // fi (!oImgs.containsKey(sSrc))
       
        String sImgSrc = ((ImageTag) oCollectionList.elementAt(i)).extractImageLocn();
        if (sImgSrc.startsWith(sPreffix)) {
          sBodyCid = doSubstitution(sBodyCid, "Src", Gadgets.replace(Gadgets.replace(sImgSrc,'\\',"\\\\"),'.',"\\x2E"), sImgSrc.substring(sPreffix.length()));
        }
       
    } // next

  // **********************************************************************
  // Replace <TABLE BACKGROUND="..." >
   
    oCollectionList = new NodeList();
    TagNameFilter oTableFilter = new TagNameFilter("TABLE");
    oPrsr = Parser.createParser(sBodyCid, sEnc);
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oTableFilter);
         
    nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("Tables NodeList.size() = " + String.valueOf(nImgs));

    for (int i=0; i<nImgs; i++) {

      sSrc = ((TableTag) oCollectionList.elementAt(i)).getAttribute("background");
      if (sSrc!=null) {
        if (sSrc.length()>0) {
          sSrc = sSrc.replace('\\','/');

          // Keep a reference to every related image name so that the same image is not included twice in the message
          if (!oImgs.containsKey(sSrc)) {

            // Find last slash from image url
            iSlash = sSrc.lastIndexOf('/');

            // Take image name
            if (iSlash>=0) {
              while (sSrc.charAt(iSlash)=='/') { if (++iSlash==sSrc.length()) break; }
                sCid = sSrc.substring(iSlash);
            } // fi
            else {
              sCid = sSrc;
            }

            if (DebugFile.trace) DebugFile.writeln("HashMap.put("+sSrc+","+sCid+")");

            oImgs.put(sSrc, sCid);
          } // fi (!oImgs.containsKey(sSrc))

          String sBckGrnd = ((TableTag) oCollectionList.elementAt(i)).getAttribute("background");
          if (sBckGrnd.startsWith(sPreffix)) {
            sBodyCid = doSubstitution(sBodyCid, "Background", Gadgets.replace(Gadgets.replace(sBckGrnd,'\\',"\\\\"),'.',"\\x2E"), sBckGrnd.substring(sPreffix.length()));
          }         
         
        } // fi
      } // fi
    } // next

  // **********************************************************************
  // Replace <TD BACKGROUND="..." >
   
    oCollectionList = new NodeList();
    TagNameFilter oTDFilter = new TagNameFilter("TD");
    oPrsr = Parser.createParser(sBodyCid, sEnc);
    for (NodeIterator e = oPrsr.elements(); e.hasMoreNodes();)
      e.nextNode().collectInto(oCollectionList, oTDFilter);
         
    nImgs = oCollectionList.size();

    if (DebugFile.trace) DebugFile.writeln("TD NodeList.size() = " + String.valueOf(nImgs));
View Full Code Here

                }
            }
        }
    };

    Parser parser = Parser.createParser(sBody, sEnc);
    NodeList list = parser.parse(null);
    list.visitAllNodesWith(linkVisitor);

    if (DebugFile.trace) {
      DebugFile.decIdent();
      DebugFile.writeln("End HtmlMimeBodyPart.addClickThroughRedirector()");
View Full Code Here

        if (DebugFile.trace)
          DebugFile.writeln(xcpt.getClass().getName()+" "+xcpt.getMessage()+" indexing message "+sGuid+" - "+sSubject);
      }
      if (oStrBuff.length()>0) {
        if (Gadgets.indexOfIgnoreCase(oStrBuff.toString(), "<html>")>=0) {
          Parser oPrsr = Parser.createParser(oStrBuff.toString(), null);
          StringBean oStrs = new StringBean();
          try {
            oPrsr.visitAllNodesWith (oStrs);
          } catch (ParserException pe) {
            if (DebugFile.trace) DebugFile.decIdent();
            throw new IOException(pe.getMessage());         
          }
View Full Code Here

      getText(oBuffer);
    }
    else if (getContentType().startsWith("text/html")) {
      StringBuffer oHtmlBuff = new StringBuffer();
      getText(oHtmlBuff);
      Parser oPrsr = Parser.createParser(oHtmlBuff.toString(), getEncoding());
      StringBean oStrBn = new StringBean();
      try {
        oPrsr.visitAllNodesWith (oStrBn);
      } catch (ParserException pe) {
        throw new MessagingException(pe.getMessage(), pe);
      }
      // Code for HTML parser 1.4
      // oStrBn.setInputHTML(oHtmlBuff.toString());
      oBuffer.append(oStrBn.getStrings());
    }
    else {
      if (DebugFile.trace) DebugFile.writeln("Multipart = DBMimeMessage.getParts()");

      Multipart oParts = getParts();

      final int iParts = oParts.getCount();

      MimePart oPart;

      int p;
      for (p=0; p<iParts && !bHasPlainTextVersion; p++) {
        oPart = (MimePart) oParts.getBodyPart(p);

        String sType = oPart.getContentType();
        if (null!=sType) sType=sType.toLowerCase();
        String sDisp = oPart.getDisposition();
        if (null==sDisp) sDisp="inline"; else if (sDisp.length()==0) sDisp="inline";

        if (DebugFile.trace) DebugFile.writeln("scanning part " + String.valueOf(p) + sDisp + " " + sType.replace('\r',' ').replace('\n', ' '));

        if (sType.startsWith("text/plain") && sDisp.equalsIgnoreCase("inline")) {
          bHasPlainTextVersion = true;
          DBMimePart.parseMimePart (oBuffer, null,
                                    getFolder().getName(),
                                    getMessageID()!=null ? getMessageID() : getContentID(),
                                    oPart, p);
        }
      }

      if (DebugFile.trace) {
        if (bHasPlainTextVersion)
          DebugFile.writeln("MimeMultipart has plain text version at part " + String.valueOf(p));
        else
          DebugFile.writeln("MimeMultipart has no plain text version, converting part 0 from HTML");
      }

      if (!bHasPlainTextVersion) {
        oPart = (MimePart) oParts.getBodyPart(0);
        StringBuffer oHtml = new StringBuffer();
        DBMimePart.parseMimePart (oHtml, null, getFolder().getName(), getMessageID()!=null ? getMessageID() : getContentID(), oPart, 0);

        Parser oPrsr = Parser.createParser(oHtml.toString(), getEncoding());
        StringBean oStrBn = new StringBean();

        try {
          oPrsr.visitAllNodesWith (oStrBn);
        } catch (ParserException pe) {
          throw new MessagingException(pe.getMessage(), pe);
        }

        // Code for HTML parser 1.4
View Full Code Here

      // ************************************************************************
      // Replace image CIDs

      String sSrc, sCid, sText = "";

      Parser oPrsr = Parser.createParser(sBody, getEncoding());

      // String sCid, sSrc;

      HtmlMimeBodyPart oHtmBdy = new HtmlMimeBodyPart(sBody, getEncoding());

      try {

        // ****************************
        // Extract plain text from HTML
        if (DebugFile.trace) DebugFile.writeln("new StringBean()");

        StringBean oStrBn = new StringBean();

        try {
          oPrsr.visitAllNodesWith (oStrBn);
        } catch (ParserException pe) {
          if (DebugFile.trace) {
            DebugFile.writeln("org.htmlparser.util.ParserException " + pe.getMessage());
          }
          throw new MessagingException(pe.getMessage(), pe);
View Full Code Here

  public static String removeAbusiveTags(String htmlCode){
    if(htmlCode == null){
      throw new NullPointerException("Input HTML code string is NULL");
    }   
    try{
      Parser parser = Parser.createParser(htmlCode, "UTF-8");
      final NodeList nl = parser.parse(null);
      nl.visitAllNodesWith(new NodeVisitor(){
        @SuppressWarnings("unchecked")
        public void visitTag(Tag tag){
          if((tag instanceof ScriptTag) ||
             (tag instanceof FrameTag)){
View Full Code Here

  /**
   * Extracts the text content of an HTML string.
   *
   * <p>The input is parsed with {@code Parser.createParser(htmlCode, "UTF-8")} and walked
   * with a {@code TextExtractingVisitor}; the text collected by that visitor is returned.
   *
   * @param htmlCode the HTML markup to process; must not be {@code null}
   * @return the visitor's extracted text, or an empty string if a
   *         {@code ParserException} is raised while visiting the nodes
   * @throws NullPointerException if {@code htmlCode} is {@code null}
   */
  public static String extractTextFromHtml(String htmlCode){
    if(htmlCode == null){
      throw new NullPointerException("Input HTML code string is NULL");
    }
   
    Parser parser = Parser.createParser(htmlCode,"UTF-8");
    TextExtractingVisitor visitor = new TextExtractingVisitor();
    try {
      parser.visitAllNodesWith(visitor);
      return visitor.getExtractedText();
    } catch (ParserException e) {
      // A parse failure is not propagated: it is logged at debug level (the message
      // includes the full input HTML) and execution falls through to the empty result.
      logger.debug("HTML parsing error: " + htmlCode, e);
    }
    return "";
View Full Code Here

    }

    // Using HTMLParser to extract the content
    String cleanedContent = null;
    Page htmlPage = new Page(cuttedContent, "UTF-8");
    Parser parser = new Parser(new Lexer(htmlPage));
    StringBean stringBean = new StringBean();

    // replace multiple whitespace with one whitespace
    stringBean.setCollapse(true);
    // Do not extract URLs
    stringBean.setLinks(false);
    // replace &nbsp; with whitespace
    stringBean.setReplaceNonBreakingSpaces(true);

    try {
      // Parse the content
      parser.visitAllNodesWith(stringBean);
      cleanedContent = stringBean.getStrings();

    } catch (ParserException ex) {
      throw new RegainException("Error while parsing content: ", ex);
    }

    // The result of parsing the html-content
    setCleanedContent(cleanedContent);

    // Extract links
    LinkVisitor linkVisitor = new LinkVisitor();
    if (isContentCutted) {
      // This means a new parser run which is expensive but neccessary
      htmlPage = new Page(rawDocument.getContentAsString(), "UTF-8");
      parser = new Parser(new Lexer(htmlPage));
    } else {
      parser.reset();
    }

    try {
      // Parse the content
      parser.visitAllNodesWith(linkVisitor);
      ArrayList<Tag> links = linkVisitor.getLinks();
      htmlPage.setBaseUrl(rawDocument.getUrl());

      // Iterate over all links found
      Iterator linksIter = links.iterator();
View Full Code Here

TOP

Related Classes of org.htmlparser.Parser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.