Examples of com.substanceofcode.utils.HTMLParser

com.substanceofcode.utils.HTMLParser
Simple and lightweight HTML parser without complete error handling. @author Irving Bunton

    throws IOException, CauseMemoryException, CauseException, Exception {
        /** Initialize item collection */
        Vector rssFeeds = new Vector();
        
        /** Initialize the HTML parser used to scan the document */
        HTMLParser parser = new HTMLParser(encodingUtil);
        try {
            
      // The first element is the main tag.
            int elementType = parser.parse();
      // If we found the prologue, get the next entry.
      if( elementType == XmlParser.PROLOGUE ) {
        elementType = parser.parse();
      }
      if (elementType == XmlParser.END_DOCUMENT ) {
        return null;
      }
            
      boolean bodyFound = false;
            do {
        if (elementType == HTMLParser.REDIRECT_URL) {
          RssItunesFeed [] feeds = new RssItunesFeed[1];
          feeds[0] = new RssItunesFeed("", parser.getRedirectUrl(),
              "", "");
          return feeds;
        }
        /** RSS item properties */
        String title = "";
        String link = "";
                        
        String tagName = parser.getName();
        //#ifdef DLOGGING
        if (finerLoggable) {logger.finer("tagname: " + tagName);}
        //#endif
        if (tagName.length() == 0) {
          continue;
        }
        switch (tagName.charAt(0)) {
          case 'm':
          case 'M':
            if (bodyFound) {
              break;
            }
            break;
          case 'b':
          case 'B':
            if (!bodyFound) {
              bodyFound = parser.isBodyFound();
            }
            break;
          case 'a':
          case 'A':
            //#ifdef DLOGGING
            if (finerLoggable) {logger.finer("Parsing <a> tag");}
            //#endif
            
            title = parser.getText();
            // Title can be 0 as this is used also for
            // getting 
            title = title.trim();
            title = StringUtil.removeHtml( title );


            if (((link = parser.getAttributeValue( "href" ))
                  == null) || ( link.length() == 0 )) {
              continue;
            }
            link = link.trim();
            if ( link.length() == 0 ) {
              continue;
            }
            if (link.indexOf("://") >= 0) {
              if (!link.startsWith("http:") &&
                !link.startsWith("https:") &&
                !link.startsWith("file:") &&
                 !link.startsWith("jar:")) {
                //#ifdef DLOGGING
                if (finerLoggable) {logger.finer("Not support for protocol or no protocol=" + link);}
                //#endif
                continue;
              }
            } else {
              if (link.charAt(0) == '/') {
                int purl = url.indexOf("://");
                if ((purl + 4) >= url.length()) {
                  //#ifdef DLOGGING
                  if (finerLoggable) {logger.finer("Url too short=" + url + "," + purl);}
                  //#endif
                  continue;
                }
                int pslash = url.indexOf("/", purl + 3);
                String burl = url;
                if (pslash >= 0) {
                  burl = url.substring(0, pslash);
                }
                link = burl + link;
              } else {
                link = url + "/" + link;
              }
            }
            
            /** Debugging information */
            //#ifdef DLOGGING
            if (finerLoggable) {logger.finer("Title:       " + title);}
            if (finerLoggable) {logger.finer("Link:        " + link);}
            //#endif
            if (( feedURLFilter != null) &&
              ( link.toLowerCase().indexOf(feedURLFilter) < 0)) {
              continue;
            }
            
            if (( feedNameFilter != null) &&
              ((title != null) &&
              (title.toLowerCase().indexOf(feedNameFilter) < 0))) {
              continue;
            }
            RssItunesFeed feed = new RssItunesFeed(title, link, "", "");
            rssFeeds.addElement( feed );
            break;
          default:
        }
            }
            while( (elementType = parser.parse()) != XmlParser.END_DOCUMENT );
            
        } catch (CauseMemoryException ex) {
      CauseMemoryException cex = new CauseMemoryException(
          "Out of memory error while parsing HTML Link feed " + url,
          ex);

View Full Code Here

    throws IOException, CauseMemoryException, CauseException, Exception {
        /** Initialize item collection */
        Vector rssFeeds = new Vector();
        
        /** Initialize the HTML parser used to scan the document */
        HTMLParser parser = new HTMLParser(encodingUtil);
        try {
            
      // The first element is the main tag.
            int elementType = parser.parse();
      // If we found the prologue, get the next entry.
      if( elementType == XmlParser.PROLOGUE ) {
        elementType = parser.parse();
      }
      if (elementType == XmlParser.END_DOCUMENT ) {
        return null;
      }
            
      boolean windows = parser.isWindows();
      boolean utf = parser.isUtf();
      boolean process = true;
      boolean bodyFound = false;
            do {
        /** RSS item properties */
        String title = "";
        String link = "";
                        
        String tagName = parser.getName();
        //#ifdef DLOGGING
        if (finerLoggable) {logger.finer("tagname: " + tagName);}
        //#endif
        switch (tagName.charAt(0)) {
          case 'b':
          case 'B':
            if (bodyFound) {
              continue;
            }
            bodyFound = parser.isBodyFound();
            if (bodyFound) {
              windows = parser.isWindows();
              utf = parser.isUtf();
            }
            // If looking for OPML link, it is in header.
            if ((!needRss || needFirstRss) && bodyFound) {
              process = false;
              break;
            }
            break;
          case 'l':
          case 'L':
            if (!tagName.toLowerCase().equals("link")) {
              break;
            }
            //#ifdef DLOGGING
            if (finerLoggable) {logger.finer("Parsing <link> tag");}
            //#endif
            
            // TODO base
            String type = parser.getAttributeValue( "type" );
            if (type == null) {
              continue;
            }
            if (!needRss && (type.toLowerCase().indexOf("opml") < 0)) {
              continue;
            }
            if (needRss &&
                ((type.toLowerCase().indexOf("rss") < 0) &&
                (type.toLowerCase().indexOf("atom") < 0))) {
              continue;
            }
            title = parser.getAttributeValue( "title" );
            // Allow null title so that the caller can
            // check if it needs to get the title another way.
            if (title != null) {
              title = EncodingUtil.replaceAlphaEntities(true,
                  title);
              title = EncodingUtil.replaceNumEntity(title);
              // Replace special chars like left quote, etc.
              // Since we have already converted to unicode, we want
              // to replace with uni chars.
              title = encodingUtil.replaceSpChars(title);


              title = StringUtil.removeHtml(title);
            }
            if (((link = parser.getAttributeValue( "href" ))
                  == null) || ( link.length() == 0 )) {
              continue;
            }
            if (link.charAt(0) == '/') {
              link = url + link;
            }
            
            /** Debugging information */
            System.out.println("Title:       " + title);
            System.out.println("Link:        " + link);
            
            /** 
             * Create new RSS item and add it to the RSS document's item
             * collection.  Account for wrong OPML which is an
             * OPML composed of other OPML.  These have url attribute
             * instead of link attribute.
             */
            if (!needRss || needFirstRss) {
              RssItunesFeed feed = new RssItunesFeed(title, link, "", "");
              rssFeeds.addElement( feed );
              process = false;
              break;
            }
            if (( feedURLFilter != null) &&
              ( link.toLowerCase().indexOf(feedURLFilter) < 0)) {
              continue;
            }
            if (( feedNameFilter != null) &&
              ((title != null) &&
              (title.toLowerCase().indexOf(feedNameFilter) < 0))) {
              continue;
            }
            RssItunesFeed feed = new RssItunesFeed(title, link, "", "");
            rssFeeds.addElement( feed );
            break;
          default:
        }
      }
            while( process && (parser.parse() != XmlParser.END_DOCUMENT) );
            
        } catch (CauseMemoryException ex) {
      CauseMemoryException cex = new CauseMemoryException(
          "Out of memory error while parsing HTML auto link feed " +
          url, ex);

View Full Code Here

    throws IOException, CauseMemoryException, CauseException, Exception {
        /** Initialize item collection */
        Vector rssFeeds = new Vector();
        
        /** Initialize the HTML parser used to scan the document */
        HTMLParser parser = new HTMLParser(encodingUtil);
        try {
            
      // The first element is the main tag.
            int elementType = parser.parse();
      // If we found the prologue, get the next entry.
      if( elementType == XmlParser.PROLOGUE ) {
        elementType = parser.parse();
      }
      if (elementType == XmlParser.END_DOCUMENT ) {
        return null;
      }
            
      boolean windows = parser.isWindows();
      boolean utf = parser.isUtf();
      boolean process = true;
      boolean bodyFound = false;
            do {
        /** RSS item properties */
        String title = "";
        String link = "";
                        
        String tagName = parser.getName();
        //#ifdef DLOGGING
//@        if (finerLoggable) {logger.finer("tagname: " + tagName);}
        //#endif
        switch (tagName.charAt(0)) {
          case 'b':
          case 'B':
            if (bodyFound) {
              continue;
            }
            bodyFound = parser.isBodyFound();
            if (bodyFound) {
              windows = parser.isWindows();
              utf = parser.isUtf();
            }
            // If looking for OPML link, it is in header.
            if ((!needRss || needFirstRss) && bodyFound) {
              process = false;
              break;
            }
            break;
          case 'l':
          case 'L':
            if (!tagName.toLowerCase().equals("link")) {
              break;
            }
            //#ifdef DLOGGING
//@            if (finerLoggable) {logger.finer("Parsing <link> tag");}
            //#endif
            
            // TODO base
            String type = parser.getAttributeValue( "type" );
            if (type == null) {
              continue;
            }
            if (!needRss && (type.toLowerCase().indexOf("opml") < 0)) {
              continue;
            }
            if (needRss &&
                ((type.toLowerCase().indexOf("rss") < 0) &&
                (type.toLowerCase().indexOf("atom") < 0))) {
              continue;
            }
            title = parser.getAttributeValue( "title" );
            // Allow null title so that the caller can
            // check if it needs to get the title another way.
            if (title != null) {
              title = EncodingUtil.replaceAlphaEntities(true,
                  title);
              title = EncodingUtil.replaceNumEntity(title);
              // Replace special chars like left quote, etc.
              // Since we have already converted to unicode, we want
              // to replace with uni chars.
              title = encodingUtil.replaceSpChars(title);


              title = StringUtil.removeHtml(title);
            }
            if (((link = parser.getAttributeValue( "href" ))
                  == null) || ( link.length() == 0 )) {
              continue;
            }
            if (link.charAt(0) == '/') {
              link = url + link;
            }
            
            /** Debugging information */
            System.out.println("Title:       " + title);
            System.out.println("Link:        " + link);
            
            /** 
             * Create new RSS item and add it to the RSS document's item
             * collection.  Account for wrong OPML which is an
             * OPML composed of other OPML.  These have url attribute
             * instead of link attribute.
             */
            if (!needRss || needFirstRss) {
              RssItunesFeed feed = new RssItunesFeed(title, link, "", "");
              rssFeeds.addElement( feed );
              process = false;
              break;
            }
            if (( feedURLFilter != null) &&
              ( link.toLowerCase().indexOf(feedURLFilter) < 0)) {
              continue;
            }
            if (( feedNameFilter != null) &&
              ((title != null) &&
              (title.toLowerCase().indexOf(feedNameFilter) < 0))) {
              continue;
            }
            RssItunesFeed feed = new RssItunesFeed(title, link, "", "");
            rssFeeds.addElement( feed );
            break;
          default:
        }
      }
            while( process && (parser.parse() != XmlParser.END_DOCUMENT) );
            
        } catch (CauseMemoryException ex) {
      CauseMemoryException cex = new CauseMemoryException(
          "Out of memory error while parsing HTML auto link feed " +
          url, ex);

View Full Code Here

    throws IOException, CauseMemoryException, CauseException, Exception {
        /** Initialize item collection */
        Vector rssFeeds = new Vector();
        
        /** Initialize the HTML parser used to scan the document */
        HTMLParser parser = new HTMLParser(encodingUtil);
        try {
            
      // The first element is the main tag.
            int elementType = parser.parse();
      // If we found the prologue, get the next entry.
      if( elementType == XmlParser.PROLOGUE ) {
        elementType = parser.parse();
      }
      if (elementType == XmlParser.END_DOCUMENT ) {
        return null;
      }
            
      boolean bodyFound = false;
            do {
        if (elementType == HTMLParser.REDIRECT_URL) {
          RssItunesFeed [] feeds = new RssItunesFeed[1];
          feeds[0] = new RssItunesFeed("", parser.getRedirectUrl(),
              "", "");
          return feeds;
        }
        /** RSS item properties */
        String title = "";
        String link = "";
                        
        String tagName = parser.getName();
        //#ifdef DLOGGING
//@        if (finerLoggable) {logger.finer("tagname: " + tagName);}
        //#endif
        if (tagName.length() == 0) {
          continue;
        }
        switch (tagName.charAt(0)) {
          case 'm':
          case 'M':
            if (bodyFound) {
              break;
            }
            break;
          case 'b':
          case 'B':
            if (!bodyFound) {
              bodyFound = parser.isBodyFound();
            }
            break;
          case 'a':
          case 'A':
            //#ifdef DLOGGING
//@            if (finerLoggable) {logger.finer("Parsing <a> tag");}
            //#endif
            
            title = parser.getText();
            // Title can be 0 as this is used also for
            // getting 
            title = title.trim();
            title = StringUtil.removeHtml( title );


            if (((link = parser.getAttributeValue( "href" ))
                  == null) || ( link.length() == 0 )) {
              continue;
            }
            link = link.trim();
            if ( link.length() == 0 ) {
              continue;
            }
            if (link.indexOf("://") >= 0) {
              if (!link.startsWith("http:") &&
                !link.startsWith("https:") &&
                !link.startsWith("file:") &&
                 !link.startsWith("jar:")) {
                //#ifdef DLOGGING
//@                if (finerLoggable) {logger.finer("Not support for protocol or no protocol=" + link);}
                //#endif
                continue;
              }
            } else {
              if (link.charAt(0) == '/') {
                int purl = url.indexOf("://");
                if ((purl + 4) >= url.length()) {
                  //#ifdef DLOGGING
//@                  if (finerLoggable) {logger.finer("Url too short=" + url + "," + purl);}
                  //#endif
                  continue;
                }
                int pslash = url.indexOf("/", purl + 3);
                String burl = url;
                if (pslash >= 0) {
                  burl = url.substring(0, pslash);
                }
                link = burl + link;
              } else {
                link = url + "/" + link;
              }
            }
            
            /** Debugging information */
            //#ifdef DLOGGING
//@            if (finerLoggable) {logger.finer("Title:       " + title);}
//@            if (finerLoggable) {logger.finer("Link:        " + link);}
            //#endif
            if (( feedURLFilter != null) &&
              ( link.toLowerCase().indexOf(feedURLFilter) < 0)) {
              continue;
            }
            
            if (( feedNameFilter != null) &&
              ((title != null) &&
              (title.toLowerCase().indexOf(feedNameFilter) < 0))) {
              continue;
            }
            RssItunesFeed feed = new RssItunesFeed(title, link, "", "");
            rssFeeds.addElement( feed );
            break;
          default:
        }
            }
            while( (elementType = parser.parse()) != XmlParser.END_DOCUMENT );
            
        } catch (CauseMemoryException ex) {
      CauseMemoryException cex = new CauseMemoryException(
          "Out of memory error while parsing HTML Link feed " + url,
          ex);

View Full Code Here

TOP

Related Classes of com.substanceofcode.utils.HTMLParser

com.substanceofcode.rssreader.businesslogic.HTMLAutoLinkParser

com.substanceofcode.rssreader.businesslogic.HTMLLinkParser

com.substanceofcode.utils.CauseMemoryException

com.substanceofcode.utils.CauseException

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle. Contact coftware#gmail.com.