// BlogBridge -- RSS feed reader, manager, and web based service
// Copyright (C) 2002-2006 by R. Pito Salas
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software Foundation;
// either version 2 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program;
// if not, write to the Free Software Foundation, Inc., 59 Temple Place,
// Suite 330, Boston, MA 02111-1307 USA
//
// Contact: R. Pito Salas
// mailto:pitosalas@users.sourceforge.net
// More information: about BlogBridge
// http://www.blogbridge.com
// http://sourceforge.net/projects/blogbridge
//
// $Id: RomeFeedParser.java,v 1.25 2008/06/26 13:41:57 spyromus Exp $
//
package com.salas.bb.utils.parser;
import com.salas.bb.networking.manager.NetManager;
import com.salas.bb.utils.Constants;
import com.salas.bb.utils.StringUtils;
import com.salas.bb.utils.i18n.Strings;
import com.salas.bb.utils.net.IPermanentRedirectionListener;
import com.salas.bb.utils.net.URLInputStream;
import com.salas.bb.utils.parser.impl.BBSyndFeedInput;
import com.salas.bb.utils.xml.XmlReaderFactory;
import com.sun.syndication.feed.module.DCModule;
import com.sun.syndication.feed.module.DCSubject;
import com.sun.syndication.feed.module.Module;
import com.sun.syndication.feed.module.SyModule;
import com.sun.syndication.feed.synd.*;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
import com.totsp.xml.syndication.content.ContentModule;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Gateway to Rome parser.
*/
/**
 * Gateway to Rome parser. Reads a feed from a URL or a stream, lets Rome parse it and
 * converts the outcome into the internal {@link Channel} and {@link Item} objects.
 */
public class RomeFeedParser implements IFeedParser
{
    private static final String TYPE_HTML = "html";
    private static final String TYPE_TEXT_HTML = "text/html";

    /** Known content types ordered from the most to the least preferred. */
    private static final List<String> CONTENT_TYPE_PREFERENCE = Arrays.asList(
        TYPE_TEXT_HTML, TYPE_HTML, "text/plain", "text", "text/xhtml", "xhtml");

    /**
     * Recognizes <code>http(s)://user:password@rest</code> URLs. User name and password
     * may not contain '/', '?', '#' or '@', so that a port number followed by a path or
     * query containing '@' is never mistaken for credentials.
     */
    private static final Pattern CREDENTIALS_PATTERN = Pattern.compile(
        "^(https?://)([^:/?#@]+):([^/?#@]+)@(.+)$", Pattern.CASE_INSENSITIVE);

    /**
     * Parses the resource by the given URL and returns the objects.
     * Credentials embedded in the URL (<code>http://user:password@host/...</code>) are
     * removed from the URL and handed to the stream as basic authentication info.
     *
     * @param xmlURL            XML URL of the resource.
     * @param title             feed title (if known); the URL is used when it's NULL.
     * @param lastUpdateTime    time of last update (server time-zone) or (-1) if not known.
     *
     * @return result.
     *
     * @throws FeedParserException      in case of any problems with parsing.
     * @throws NullPointerException     if the URL is NULL.
     * @throws java.io.IOException      if there's a problem with reading feed.
     */
    public FeedParserResult parse(URL xmlURL, String title, long lastUpdateTime)
        throws FeedParserException, IOException
    {
        if (xmlURL == null) throw new NullPointerException(Strings.error("unspecified.url"));

        FeedParserResult result = new FeedParserResult();

        // Extract the credentials (if any) and continue with the credentials-free URL
        String username = null;
        String password = null;
        Matcher matcher = CREDENTIALS_PATTERN.matcher(xmlURL.toString());
        if (matcher.find())
        {
            username = matcher.group(2);
            password = matcher.group(3);
            xmlURL = new URL(matcher.group(1) + matcher.group(4));
        }

        // Create stream for reading the feed and register it
        URLInputStream stream = new URLInputStream(xmlURL, lastUpdateTime);
        stream.setBasicAuthenticationInfo(username, password);
        if (title == null) title = xmlURL.toString();
        NetManager.register(NetManager.TYPE_POLLING, title, title, stream);
        stream.setRedirectionListener(new RedirectionRecorder(result));

        try
        {
            // Connecting happens inside the guarded section, so that the registered
            // stream gets closed even when the connection attempt fails.
            stream.connect();

            long lastModifiedTime = stream.getLastModifiedTime();
            if (lastModifiedTime == -1) lastModifiedTime = stream.getServerTime();

            // Nothing to parse when the server reports that the feed hasn't changed
            if (stream.getResponseCode() != HttpURLConnection.HTTP_NOT_MODIFIED)
            {
                result = parse(stream, result, xmlURL);
            }

            Channel channel = result.getChannel();
            if (channel != null) channel.setLastUpdateServerTime(lastModifiedTime);
        } finally
        {
            stream.close();
        }

        return result;
    }

    /**
     * Parses the resource presented by a stream and returns the objects.
     *
     * @param stream    XML stream.
     * @param rootURL   URL for the relative links resolution.
     *
     * @return result.
     *
     * @throws FeedParserException  in case of any problems with parsing.
     * @throws java.io.IOException  if there's a problem with reading feed.
     */
    public FeedParserResult parse(InputStream stream, URL rootURL)
        throws IOException, FeedParserException
    {
        return parse(stream, new FeedParserResult(), rootURL);
    }

    /**
     * Parses the resource by the given stream.
     *
     * @param aStream   stream to parse as feed.
     * @param aResult   object with result to fill.
     * @param aFeedURL  root URL of a feed for the relative links resolution.
     *
     * @return result (the same object as <code>aResult</code>).
     *
     * @throws FeedParserException  in case of any problems with parsing, including
     *                              a feed of unrecognized type.
     * @throws IOException          if there's a problem with reading feed.
     */
    protected FeedParserResult parse(InputStream aStream, FeedParserResult aResult, URL aFeedURL)
        throws IOException, FeedParserException
    {
        try
        {
            SyndFeed feed = buildFeed(aStream);

            Channel channel = convertFeed(feed, aFeedURL);
            aResult.setChannel(channel);

            // Add items. Rome returns an untyped list of SyndEntry objects.
            @SuppressWarnings("unchecked")
            List<SyndEntry> entries = (List<SyndEntry>)feed.getEntries();
            for (SyndEntry entry : entries)
            {
                channel.addItem(convertItem(entry, aFeedURL));
            }
        } catch (FeedException e)
        {
            throw new FeedParserException(Strings.error("failed.to.parse.the.feed"), e);
        }

        return aResult;
    }

    /**
     * Lets Rome build the feed object from the stream.
     *
     * @param aStream   stream to read the feed from.
     *
     * @return feed object.
     *
     * @throws FeedException        if Rome fails to parse the feed.
     * @throws FeedParserException  if Rome doesn't recognize the type of the feed.
     * @throws IOException          if there's a problem with reading feed.
     */
    private static SyndFeed buildFeed(InputStream aStream)
        throws IOException, FeedException, FeedParserException
    {
        SyndFeedInput input = new BBSyndFeedInput();
        try
        {
            return input.build(XmlReaderFactory.create(aStream));
        } catch (IllegalArgumentException e)
        {
            // Rome reports a feed type it doesn't understand with an unchecked exception;
            // for the callers it's just another parsing failure.
            throw new FeedParserException(Strings.error("failed.to.parse.the.feed"), e);
        }
    }

    /**
     * Converts feed object into internal format.
     *
     * @param aFeed     source feed object.
     * @param aFeedURL  root URL of a feed for the relative links resolution.
     *
     * @return internal object.
     *
     * @throws MalformedURLException if the site link of the feed is not a valid URL.
     */
    private static Channel convertFeed(SyndFeed aFeed, URL aFeedURL)
        throws MalformedURLException
    {
        Channel channel = new Channel();
        channel.setAuthor(aFeed.getAuthor());
        channel.setDescription(aFeed.getDescription());
        channel.setFormat(aFeed.getFeedType());
        channel.setLanguage(aFeed.getLanguage());

        String siteLink = aFeed.getLink();
        URL siteURL = null;
        if (!StringUtils.isEmpty(siteLink))
        {
            // The link may be relative to the feed URL
            siteURL = new URL(aFeedURL, StringUtils.fixURL(siteLink));
        }
        channel.setSiteURL(siteURL);

        channel.setTitle(aFeed.getTitle());

        // The feed is updated "frequency" times per "period", so the effective
        // period between two updates is the period divided by the frequency.
        long period = getUpdatePeriod(aFeed);
        if (period != -1)
        {
            int updateFrequency = getUpdateFrequency(aFeed);
            if (updateFrequency > 1) period = period / updateFrequency;
        }
        channel.setUpdatePeriod(period);

        return channel;
    }

    /**
     * Returns update frequency of the feed in times.
     *
     * @param aFeed feed.
     *
     * @return frequency or <code>-1</code> if the feed has no syndication module.
     */
    private static int getUpdateFrequency(SyndFeed aFeed)
    {
        SyModule module = (SyModule)aFeed.getModule(SyModule.URI);
        return module == null ? -1 : module.getUpdateFrequency();
    }

    /**
     * Returns update period in milliseconds.
     *
     * @param aFeed feed.
     *
     * @return period in ms or <code>-1</code> if not specified.
     */
    private static long getUpdatePeriod(SyndFeed aFeed)
    {
        SyModule module = (SyModule)aFeed.getModule(SyModule.URI);
        return module == null ? -1 : periodToValue(module.getUpdatePeriod());
    }

    /**
     * Converts the name of period to corresponding value.
     *
     * @param periodName period name.
     *
     * @return value in ms or -1 if period name isn't known or NULL.
     */
    private static long periodToValue(String periodName)
    {
        long period = -1;

        if (SyModule.YEARLY.equalsIgnoreCase(periodName))
        {
            period = Constants.MILLIS_IN_YEAR;
        } else if (SyModule.MONTHLY.equalsIgnoreCase(periodName))
        {
            period = Constants.MILLIS_IN_MONTH;
        } else if (SyModule.WEEKLY.equalsIgnoreCase(periodName))
        {
            period = Constants.MILLIS_IN_WEEK;
        } else if (SyModule.HOURLY.equalsIgnoreCase(periodName))
        {
            period = Constants.MILLIS_IN_HOUR;
        } else if (SyModule.DAILY.equalsIgnoreCase(periodName))
        {
            period = Constants.MILLIS_IN_DAY;
        }

        return period;
    }

    /**
     * Converts item object into internal item format.
     *
     * @param anEntry   source item object.
     * @param aFeedURL  root URL of a feed for the relative links resolution.
     *
     * @return internal object.
     */
    private static Item convertItem(SyndEntry anEntry, URL aFeedURL)
    {
        String text = getEntryText(anEntry);

        // Append enclosures to the end of the article
        String enclosures = formatEnclosures(anEntry);
        if (enclosures.length() > 0) text += enclosures;

        // The "<No Title>" placeholder means that there's no title at all
        String title = anEntry.getTitle();
        if (title != null && title.equals("<No Title>")) title = null;

        Item item = new Item(text);
        item.setAuthor(anEntry.getAuthor());

        // An item with a broken link is still worth showing, only without the link
        URL itemLink;
        try
        {
            String link = anEntry.getLink();
            itemLink = link == null ? null : new URL(aFeedURL, link);
        } catch (MalformedURLException e)
        {
            itemLink = null;
        }
        item.setLink(itemLink);

        // Fall back to the update date when the publication date isn't given
        item.setPublicationDate(anEntry.getPublishedDate());
        if (item.getPublicationDate() == null) item.setPublicationDate(anEntry.getUpdatedDate());

        item.setTitle(title);
        item.setUri(anEntry.getUri());
        item.setSubject(getEntrySubject(anEntry));

        return item;
    }

    /**
     * Formats all enclosures of an entry for inclusion in the article text. When the
     * entry has no explicit enclosures, its links are scanned for possible enclosures.
     *
     * @param anEntry entry.
     *
     * @return formatted enclosures or empty string if there are none.
     */
    private static String formatEnclosures(SyndEntry anEntry)
    {
        StringBuilder html = new StringBuilder();

        List<?> enclosures = anEntry.getEnclosures();
        if (enclosures != null && enclosures.size() > 0)
        {
            for (Object en : enclosures)
            {
                SyndEnclosure enclosure = (SyndEnclosure)en;
                String location = enclosure.getUrl();
                if (location != null)
                {
                    html.append(formatEnclosure(location, enclosure.getLength()));
                }
            }
        } else
        {
            // Scan links list for possible enclosures.
            // Note: We do this in "else" block because the method is
            // not very reliable and if there are explicit enclosures
            // mentioned, we'd better not do this.
            List<?> links = anEntry.getLinks();
            if (links != null)
            {
                for (Object lnk : links)
                {
                    SyndLink link = (SyndLink)lnk;
                    String rel = link.getRel();
                    long length = link.getLength();
                    String location = link.getHref();

                    boolean enclosureRel = StringUtils.isEmpty(rel) ||
                        "enclosure".equalsIgnoreCase(rel);
                    if (length > 0 && enclosureRel && StringUtils.isNotEmpty(location))
                    {
                        html.append(formatEnclosure(location, length));
                    }
                }
            }
        }

        return html.toString();
    }

    /**
     * Returns the subject of an entry. The names of the categories joined with spaces
     * are used when the entry has categories; otherwise the Dublin Core subject is used.
     *
     * @param anEntry entry.
     *
     * @return subject or NULL if the entry has neither categories nor DC subject.
     */
    private static String getEntrySubject(SyndEntry anEntry)
    {
        // Rome returns an untyped list of SyndCategory objects
        @SuppressWarnings("unchecked")
        List<SyndCategory> categories = (List<SyndCategory>)anEntry.getCategories();
        if (categories != null && !categories.isEmpty())
        {
            List<String> names = new ArrayList<String>();
            for (SyndCategory category : categories)
            {
                String name = category.getName();
                if (StringUtils.isNotEmpty(name)) names.add(name);
            }
            return StringUtils.join(names.iterator(), " ");
        }

        String subject = null;
        DCModule dc = (DCModule)anEntry.getModule(DCModule.URI);
        if (dc != null)
        {
            DCSubject dcSubject = dc.getSubject();
            if (dcSubject != null) subject = dcSubject.getValue();
        }

        return subject;
    }

    /**
     * Formats an enclosure URL and length for inclusion in the article text.
     * The last path component of the location is shown as the name of the link.
     *
     * @param location  location (not NULL).
     * @param length    length in bytes; not shown when it isn't positive.
     *
     * @return string.
     */
    public static String formatEnclosure(String location, long length)
    {
        // split() drops trailing empty strings, so a location made of slashes only
        // produces an empty array; show the location itself in that case.
        String[] linkComponents = location.split("/");
        String filename = linkComponents.length == 0
            ? location : linkComponents[linkComponents.length - 1];

        // The attribute is single-quoted, so a raw apostrophe (or an angle bracket)
        // would cut the link short. Their percent-encoded forms are equivalent in a URL.
        String href = location.replace("'", "%27").replace("<", "%3C").replace(">", "%3E");

        String size = length > 0 ? " (" + StringUtils.sizeToString(length) + ")" : "";

        return "<p id=\"bbenclosure\">" +
            "<b>" + Strings.message("feed.parser.enclosure") + "</b> <a href='" + href + "'>" +
            filename + "</a>" + size +
            "</p>";
    }

    /**
     * Returns the text of an entry. The sources are tried in this order: the RSS/RDF
     * content module, the most preferred of the entry contents, the entry description
     * and the Dublin Core description. A "no text" message is returned when all
     * of them are empty.
     *
     * @param anEntry entry.
     *
     * @return text.
     */
    private static String getEntryText(SyndEntry anEntry)
    {
        String text = null;

        // Check if the RSS/RDF content module is present
        Module module = anEntry.getModule(ContentModule.URI);
        if (module != null)
        {
            ContentModule cmod = (ContentModule)module;
            List<?> encodeds = cmod.getEncodeds();
            if (encodeds != null && encodeds.size() > 0)
            {
                text = (String)encodeds.get(0);
            }
        }

        // If there was no content module, check various content types (Atom)
        if (text == null)
        {
            int bestRank = Integer.MAX_VALUE;
            SyndContent content = null;

            // Select the best content of all available; the first one wins a tie.
            // Rome returns an untyped list of SyndContent objects.
            @SuppressWarnings("unchecked")
            List<SyndContent> contents = (List<SyndContent>)anEntry.getContents();
            if (contents != null)
            {
                for (SyndContent cont : contents)
                {
                    int rank = getContentType(cont.getType());
                    if (rank < bestRank)
                    {
                        bestRank = rank;
                        content = cont;
                    }
                }
            }

            if (content == null) content = anEntry.getDescription();

            // The value is taken as is: unescaping of HTML content was dropped
            // because it seems Rome 0.9 started to unescape feeds on its own.
            if (content != null) text = content.getValue();
        }

        // Check DC module
        if (StringUtils.isEmpty(text))
        {
            DCModule dcModule = (DCModule)anEntry.getModule(DCModule.URI);
            if (dcModule != null) text = dcModule.getDescription();
        }

        if (StringUtils.isEmpty(text)) text = Strings.message("feed.parser.no.text");

        return text;
    }

    /**
     * Returns content type preference order. Unknown and unspecified types get the
     * same rank, which is greater than the rank of any known type, so that they are
     * picked only when there's nothing better.
     *
     * @param contentType type (may be NULL).
     *
     * @return order (the lower, the more preferred).
     */
    private static int getContentType(String contentType)
    {
        // Explicit locale keeps the lower-casing independent of the default locale
        int index = contentType == null ? -1
            : CONTENT_TYPE_PREFERENCE.indexOf(contentType.toLowerCase(Locale.ENGLISH));

        return index < 0 ? CONTENT_TYPE_PREFERENCE.size() : index;
    }

    /**
     * Listener of permanent redirections notifications. Once the notification comes
     * the listener records new URL in the associated result object.
     */
    private static class RedirectionRecorder implements IPermanentRedirectionListener
    {
        private final FeedParserResult result;

        /**
         * Creates redirection recorder for a given result object.
         *
         * @param aResult result object.
         */
        public RedirectionRecorder(FeedParserResult aResult)
        {
            result = aResult;
        }

        /**
         * Invoked when redirection detected.
         *
         * @param newLocation new location.
         */
        public void redirectedTo(URL newLocation)
        {
            result.setRedirectionURL(newLocation);
        }
    }
}