* HTMLAutoLinkParser.java
* Copyright (C) 2007-2008 Tommi Laukkanen
* Copyright (C) 2007-2008 Irving Bunton
* http://www.substanceofcode.com
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Expand to define memory size define
// Expand to define logging define
//#define DNOLOGGING
/* This functionality adds to jar size, so don't do it for small memory */
/* devices. */
//#ifndef DSMALLMEM
package com.substanceofcode.rssreader.businesslogic;
import com.substanceofcode.rssreader.businessentities.RssItunesFeed;
import com.substanceofcode.utils.HTMLParser;
import com.substanceofcode.utils.XmlParser;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Vector;
import com.substanceofcode.utils.EncodingUtil;
import com.substanceofcode.utils.StringUtil;
import com.substanceofcode.utils.CauseException;
import com.substanceofcode.utils.CauseMemoryException;
//#ifdef DLOGGING
//@import net.sf.jlogmicro.util.logging.Logger;
//@import net.sf.jlogmicro.util.logging.LogManager;
//@import net.sf.jlogmicro.util.logging.Level;
* HTMLAutoLinkParser class is used when we are parsing RSS feed list
* using HTML autolinks <link href="link" title="Name"/>.
* The type attribute of these links contains "rss" or "atom"
* (or "opml" when an OPML link is wanted instead of a feed).
* @author Irving Bunton
public class HTMLAutoLinkParser extends FeedListParser {
// Selection flag, true by default; written by setNeedRss and read by isNeedRss.
// Presumably forwarded as the needRss argument of the static parseFeeds
// (rss/atom links when true, opml links when false) - TODO confirm.
boolean m_needRss = true;
// False by default. Presumably forwarded as the needFirstRss argument of the
// static parseFeeds, where it ends parsing after the first stored link
// - TODO confirm.
boolean m_needFirstRss = false;
//#ifdef DLOGGING
//@ private Logger logger = Logger.getLogger("HTMLAutoLinkParser");
//@ private boolean fineLoggable = logger.isLoggable(Level.FINE);
//@ private boolean finerLoggable = logger.isLoggable(Level.FINER);
//@ private boolean finestLoggable = logger.isLoggable(Level.FINEST);
/**
 * Creates a new instance of HTMLAutoLinkParser.
 * All three arguments are handed unchanged to the FeedListParser
 * constructor; nothing else is initialized here.
 *
 * @param url passed to the superclass; presumably the address of the HTML
 *            page to scan (confirm in FeedListParser)
 * @param username passed to the superclass; presumably a credential used
 *            when fetching the page (confirm in FeedListParser)
 * @param password passed to the superclass; see username
 */
public HTMLAutoLinkParser(String url, String username, String password) {
super(url, username, password);
/**
 * Parses the feed links found in an HTML stream.
 * The stream is wrapped in a new EncodingUtil and the work is delegated to
 * the static parseFeeds overload of this class.
 *
 * A CauseException coming out of the delegate is rethrown unchanged. Any
 * other Throwable is wrapped in a new CauseException whose message names
 * m_url, reported on System.out (and logged as severe when DLOGGING is
 * defined), and then thrown.
 * NOTE(review): whether a CauseMemoryException is rethrown unchanged or
 * wrapped depends on whether it extends CauseException - confirm.
 *
 * @param is stream holding the HTML document
 * @return the value returned by the static parseFeeds overload
 * @throws CauseException when the delegate fails; see above for wrapping
 */
public RssItunesFeed[] parseFeeds(InputStream is)
throws IOException, CauseMemoryException, CauseException, Exception {
// Init in case we get a severe error.
try {
return HTMLAutoLinkParser.parseFeeds(new EncodingUtil(is),
//#ifdef DLOGGING
//@ ,logger
//@ ,fineLoggable
//@ ,finerLoggable
//@ ,finestLoggable
} catch (CauseException ex) {
// Already carries its cause; pass it on untouched.
throw ex;
} catch (Throwable t) {
// Everything else is wrapped so the caller sees which URL failed.
CauseException cex = new CauseException(
"Error while parsing HTML auto link feed " + m_url, t);
//#ifdef DLOGGING
//@ logger.severe(cex.getMessage(), cex);
System.out.println(cex.getMessage() + " " + t + " " + t.getMessage());
throw cex;
/**
 * Parse feeds. Allow null title.
 *
 * Runs an HTMLParser over the document and looks at every tag whose name
 * starts with 'b'/'B' or 'l'/'L':
 * - the 'b' case asks the parser whether the body has been found and, when
 *   one link is wanted (needRss false or needFirstRss true), clears the
 *   loop flag once the body is reached, because such links live in the
 *   header;
 * - the 'l' case handles tags named "link" (compared case-insensitively):
 *   it reads the type, title and href attributes, cleans the title,
 *   prefixes url to an href that starts with '/', applies the two optional
 *   filters and stores the result as an RssItunesFeed.
 * Presumably a tag that fails one of the tests is skipped; the statements
 * taken on failure are not visible here - TODO confirm.
 *
 * @param encodingUtil source handed to the HTMLParser; also used to replace
 *            special characters in titles
 * @param url prepended to any href that starts with '/', and included in
 *            the error messages
 * @param needRss true to look for types containing "rss" or "atom", false
 *            to look for types containing "opml"
 * @param needFirstRss when true (or when needRss is false) the loop flag is
 *            cleared as soon as one link has been stored
 * @param feedNameFilter may be null; otherwise searched for with indexOf in
 *            the lower-cased title when the title is not null
 * @param feedURLFilter may be null; otherwise searched for with indexOf in
 *            the lower-cased link
 * @return null when the document ends right after the optional prologue;
 *            otherwise an array created with one slot per collected feed
 * @throws CauseMemoryException rewrapped with a message naming url
 * @throws CauseException wrapping any other Exception or Throwable raised
 *            while parsing, with a message naming url
 */
static public RssItunesFeed[] parseFeeds(EncodingUtil encodingUtil,
String url,
boolean needRss,
boolean needFirstRss,
String feedNameFilter,
String feedURLFilter
//#ifdef DLOGGING
//@ ,Logger logger,
//@ boolean fineLoggable,
//@ boolean finerLoggable,
//@ boolean finestLoggable
throws IOException, CauseMemoryException, CauseException, Exception {
/** Initialize item collection */
Vector rssFeeds = new Vector();
/** Initialize XML parser and parse OPML XML */
HTMLParser parser = new HTMLParser(encodingUtil);
try {
// The first element is the main tag.
int elementType = parser.parse();
// If we found the prologue, get the next entry.
if( elementType == XmlParser.PROLOGUE ) {
elementType = parser.parse();
// Nothing follows the (optional) prologue: no array is built at all.
if (elementType == XmlParser.END_DOCUMENT ) {
return null;
// Encoding flags as currently reported by the parser.
boolean windows = parser.isWindows();
boolean utf = parser.isUtf();
// Loop flag; a case clears it to end the scan before END_DOCUMENT.
boolean process = true;
boolean bodyFound = false;
do {
/** RSS item properties */
String title = "";
String link = "";
String tagName = parser.getName();
//#ifdef DLOGGING
//@ if (finerLoggable) {logger.finer("tagname: " + tagName);}
// Dispatch on the first character only; the 'l' case then checks the
// full tag name itself.
switch (tagName.charAt(0)) {
case 'b':
case 'B':
if (bodyFound) {
// Ask the parser whether the body has been reached and, if so,
// re-read the encoding flags from it.
bodyFound = parser.isBodyFound();
if (bodyFound) {
windows = parser.isWindows();
utf = parser.isUtf();
// If looking for OPML link, it is in header.
if ((!needRss || needFirstRss) && bodyFound) {
process = false;
case 'l':
case 'L':
// Case-insensitive test that the tag really is named "link".
if (!tagName.toLowerCase().equals("link")) {
//#ifdef DLOGGING
//@ if (finerLoggable) {logger.finer("Parsing <link> tag");}
// TODO base
String type = parser.getAttributeValue( "type" );
if (type == null) {
// The next two tests detect a type that does not match what is wanted:
// "opml" when needRss is false, "rss" or "atom" when needRss is true.
if (!needRss && (type.toLowerCase().indexOf("opml") < 0)) {
if (needRss &&
((type.toLowerCase().indexOf("rss") < 0) &&
(type.toLowerCase().indexOf("atom") < 0))) {
title = parser.getAttributeValue( "title" );
// Allow null title so that the caller can
// check if it needs to get the title another way.
if (title != null) {
title = EncodingUtil.replaceAlphaEntities(true,
title = EncodingUtil.replaceNumEntity(title);
// Replace special chars like left quote, etc.
// Since we have already converted to unicode, we want
// to replace with uni chars.
title = encodingUtil.replaceSpChars(title);
title = StringUtil.removeHtml(title);
// Detect a missing or empty href.
if (((link = parser.getAttributeValue( "href" ))
== null) || ( link.length() == 0 )) {
// An href starting with '/' gets url prepended exactly as given; no
// host/base resolution is done (see the "TODO base" note above).
if (link.charAt(0) == '/') {
link = url + link;
/** Debugging information */
System.out.println("Title: " + title);
System.out.println("Link: " + link);
* Create new RSS item and add it do RSS document's item
* collection. Account for wrong OPML which is an
* OPML composed of other OPML. These have url attribute
* instead of link attribute.
// Single-link modes: store this link and clear the loop flag.
if (!needRss || needFirstRss) {
RssItunesFeed feed = new RssItunesFeed(title, link, "", "");
rssFeeds.addElement( feed );
process = false;
// Only the link/title is lower-cased before the search, not the filter,
// so callers presumably supply the filters already in lower case.
if (( feedURLFilter != null) &&
( link.toLowerCase().indexOf(feedURLFilter) < 0)) {
if (( feedNameFilter != null) &&
((title != null) &&
(title.toLowerCase().indexOf(feedNameFilter) < 0))) {
RssItunesFeed feed = new RssItunesFeed(title, link, "", "");
rssFeeds.addElement( feed );
// Continue until a case cleared 'process' or the parser reports the end
// of the document.
while( process && (parser.parse() != XmlParser.END_DOCUMENT) );
} catch (CauseMemoryException ex) {
// Rewrap so the message names the url being parsed.
CauseMemoryException cex = new CauseMemoryException(
"Out of memory error while parsing HTML auto link feed " +
url, ex);
throw cex;
} catch (Exception ex) {
CauseException cex = new CauseException(
"Error while parsing HTML auto link feed " + url, ex);
//#ifdef DLOGGING
//@ logger.severe(cex.getMessage(), cex);
System.err.println(cex.getMessage() + " " + ex + " " + ex.getMessage());
throw cex;
} catch (Throwable t) {
// Same wrapping for anything that is not an Exception.
CauseException cex = new CauseException(
"Error while parsing HTML auto link feed " + url, t);
//#ifdef DLOGGING
//@ logger.severe(cex.getMessage(), cex);
System.err.println(cex.getMessage() + " " + t + " " + t.getMessage());
throw cex;
/** Create array */
RssItunesFeed[] feeds = new RssItunesFeed[ rssFeeds.size() ];
if (feeds.length > 0) {
return feeds;
/**
 * Stores the given value in m_needRss.
 *
 * @param needRss new value of the flag
 */
public void setNeedRss(boolean needRss) {
this.m_needRss = needRss;
/**
 * Returns the current value of m_needRss.
 *
 * @return the flag as last set by setNeedRss, or its initial value
 */
public boolean isNeedRss() {
return (m_needRss);