// BlogBridge -- RSS feed reader, manager, and web based service
// Copyright (C) 2002-2006 by R. Pito Salas
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software Foundation;
// either version 2 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program;
// if not, write to the Free Software Foundation, Inc., 59 Temple Place,
// Suite 330, Boston, MA 02111-1307 USA
//
// Contact: R. Pito Salas
// mailto:pitosalas@users.sourceforge.net
// More information: about BlogBridge
// http://www.blogbridge.com
// http://sourceforge.net/projects/blogbridge
//
// $Id: Atom10ParserV2.java,v 1.1 2007/10/01 17:03:27 spyromus Exp $
//
package com.salas.bb.utils.parser.impl;
import com.sun.syndication.feed.WireFeed;
import com.sun.syndication.feed.atom.*;
import com.sun.syndication.feed.atom.Content;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.impl.Atom10Parser;
import com.sun.syndication.io.impl.DateParser;
import org.jdom.*;
import org.jdom.output.XMLOutputter;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
* Custom Atom 1.0 parser fixing URL connection bugs.
*/
public class Atom10ParserV2 extends Atom10Parser
{
private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
Namespace ns = Namespace.getNamespace(ATOM_10_URI);
protected WireFeed parseFeed(Element eFeed) throws FeedException
{
com.sun.syndication.feed.atom.Feed feed =
new com.sun.syndication.feed.atom.Feed(getType());
String baseURI;
try
{
baseURI = findBaseURI(eFeed);
} catch (Exception e)
{
throw new FeedException("ERROR while finding base URI of feed", e);
}
String xmlBase = eFeed.getAttributeValue("base", Namespace.XML_NAMESPACE);
if (xmlBase != null)
{
feed.setXmlBase(xmlBase);
}
Element e = eFeed.getChild("title", getAtomNamespace());
if (e != null)
{
Content c = new Content();
c.setValue(parseTextConstructToString(e));
c.setType(e.getAttributeValue("type")); //, Namespace.XML_NAMESPACE));
feed.setTitleEx(c);
}
List<Element> eList = getChildren(eFeed, "link");
feed.setAlternateLinks(parseAlternateLinks(feed, null, baseURI, eList));
feed.setOtherLinks(parseOtherLinks(feed, null, baseURI, eList));
List cList = eFeed.getChildren("category", getAtomNamespace());
feed.setCategories(parseCategories(baseURI, cList));
eList = getChildren(eFeed, "author");
if (eList.size() > 0)
{
feed.setAuthors(parsePersons(baseURI, eList));
}
eList = getChildren(eFeed, "contributor");
if (eList.size() > 0)
{
feed.setContributors(parsePersons(baseURI, eList));
}
e = eFeed.getChild("subtitle", getAtomNamespace());
if (e != null)
{
Content subtitle = new Content();
subtitle.setValue(parseTextConstructToString(e));
subtitle.setType(e.getAttributeValue("type")); //, Namespace.XML_NAMESPACE));
feed.setSubtitle(subtitle);
}
e = eFeed.getChild("id", getAtomNamespace());
if (e != null)
{
feed.setId(e.getText());
}
e = eFeed.getChild("generator", getAtomNamespace());
if (e != null)
{
Generator gen = new Generator();
gen.setValue(e.getText());
String att = e.getAttributeValue("uri");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
gen.setUrl(att);
}
att = e.getAttributeValue("version");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
gen.setVersion(att);
}
feed.setGenerator(gen);
}
e = eFeed.getChild("rights", getAtomNamespace());
if (e != null)
{
feed.setRights(parseTextConstructToString(e));
}
e = eFeed.getChild("icon", getAtomNamespace());
if (e != null)
{
feed.setIcon(e.getText());
}
e = eFeed.getChild("logo", getAtomNamespace());
if (e != null)
{
feed.setLogo(e.getText());
}
e = eFeed.getChild("updated", getAtomNamespace());
if (e != null)
{
feed.setUpdated(DateParser.parseDate(e.getText()));
}
feed.setModules(parseFeedModules(eFeed));
eList = getChildren(eFeed, "entry");
if (eList.size() > 0)
{
feed.setEntries(parseEntries(feed, baseURI, eList));
}
List foreignMarkup =
extractForeignMarkup(eFeed, feed, getAtomNamespace());
if (foreignMarkup.size() > 0)
{
feed.setForeignMarkup(foreignMarkup);
}
return feed;
}
private List<Element> getChildren(Element eFeed, String name)
{
return (List<Element>)eFeed.getChildren(name, getAtomNamespace());
}
private Link parseLink(Feed feed, Entry entry, String baseURI, Element eLink)
{
Link link = new Link();
String att = eLink.getAttributeValue("rel");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
link.setRel(att);
}
att = eLink.getAttributeValue("type");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
link.setType(att);
}
att = eLink.getAttributeValue("href");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
if (isRelativeURI(att))
{
link.setHref(resolveURI(baseURI, eLink, att));
} else
{
link.setHref(att);
}
}
att = eLink.getAttributeValue("title");
if (att != null)
{
link.setTitle(att);
}
att = eLink.getAttributeValue("hreflang");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
link.setHreflang(att);
}
att = eLink.getAttributeValue("length");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
link.setLength(Long.parseLong(att));
}
return link;
}
// List(Elements) -> List(Link)
private List<Link> parseAlternateLinks(Feed feed, Entry entry, String baseURI, List<Element> eLinks)
{
List<Link> links = new ArrayList<Link>();
for (Element eLink : eLinks)
{
Link link = parseLink(feed, entry, baseURI, eLink);
if (link.getRel() == null
|| "".equals(link.getRel().trim())
|| "alternate".equals(link.getRel()))
{
links.add(link);
}
}
return (links.size() > 0) ? links : null;
}
private List<Link> parseOtherLinks(Feed feed, Entry entry, String baseURI, List<Element> eLinks)
{
List<Link> links = new ArrayList<Link>();
for (Element eLink : eLinks)
{
Link link = parseLink(feed, entry, baseURI, eLink);
if (!"alternate".equals(link.getRel()))
{
links.add(link);
}
}
return (links.size() > 0) ? links : null;
}
private Person parsePerson(String baseURI, Element ePerson)
{
Person person = new Person();
Element e = ePerson.getChild("name", getAtomNamespace());
if (e != null)
{
person.setName(e.getText());
}
e = ePerson.getChild("uri", getAtomNamespace());
if (e != null)
{
person.setUri(resolveURI(baseURI, ePerson, e.getText()));
}
e = ePerson.getChild("email", getAtomNamespace());
if (e != null)
{
person.setEmail(e.getText());
}
return person;
}
// List(Elements) -> List(Persons)
private List<Person> parsePersons(String baseURI, List<Element> ePersons)
{
List<Person> persons = new ArrayList<Person>();
for (Element ePerson : ePersons) persons.add(parsePerson(baseURI, ePerson));
return (persons.size() > 0) ? persons : null;
}
private Content parseContent(Element e)
{
String value = parseTextConstructToString(e);
String src = e.getAttributeValue("src");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
String type = e.getAttributeValue("type");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
Content content = new Content();
content.setSrc(src);
content.setType(type);
content.setValue(value);
return content;
}
private String parseTextConstructToString(Element e)
{
String value;
String type = e.getAttributeValue("type");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
type = (type != null) ? type : Content.TEXT;
if (type.equals(Content.XHTML))
{
// XHTML content needs special handling
XMLOutputter outputter = new XMLOutputter();
List<org.jdom.Content> eContent = (List<org.jdom.Content>)e.getContent();
for (org.jdom.Content c : eContent)
{
if (c instanceof Element)
{
Element eC = (Element)c;
if (eC.getNamespace().equals(getAtomNamespace()))
{
((Element)c).setNamespace(Namespace.NO_NAMESPACE);
}
}
}
value = outputter.outputString(eContent);
} else
{
// Everything else comes in verbatim
value = e.getText();
}
return value;
}
// List(Elements) -> List(Entries)
protected List parseEntries(Feed feed, String baseURI, List eEntries)
{
List<Entry> entries = new ArrayList<Entry>();
for (Element eEntry : (List<Element>)eEntries)
{
entries.add(parseEntry(feed, eEntry, baseURI));
}
return (entries.size() > 0) ? entries : null;
}
protected Entry parseEntry(Feed feed, Element eEntry, String baseURI)
{
Entry entry = new Entry();
String xmlBase = eEntry.getAttributeValue("base", Namespace.XML_NAMESPACE);
if (xmlBase != null)
{
entry.setXmlBase(xmlBase);
}
Element e = eEntry.getChild("title", getAtomNamespace());
if (e != null)
{
Content c = new Content();
c.setValue(parseTextConstructToString(e));
c.setType(e.getAttributeValue("type")); //, Namespace.XML_NAMESPACE));
entry.setTitleEx(c);
}
List<Element> eList = getChildren(eEntry, "link");
entry.setAlternateLinks(parseAlternateLinks(feed, entry, baseURI, eList));
entry.setOtherLinks(parseOtherLinks(feed, entry, baseURI, eList));
eList = getChildren(eEntry, "author");
if (eList.size() > 0)
{
entry.setAuthors(parsePersons(baseURI, eList));
}
eList = getChildren(eEntry, "contributor");
if (eList.size() > 0)
{
entry.setContributors(parsePersons(baseURI, eList));
}
e = eEntry.getChild("id", getAtomNamespace());
if (e != null)
{
entry.setId(e.getText());
}
e = eEntry.getChild("updated", getAtomNamespace());
if (e != null)
{
entry.setUpdated(DateParser.parseW3CDateTime(e.getText()));
}
e = eEntry.getChild("published", getAtomNamespace());
if (e != null)
{
entry.setPublished(DateParser.parseW3CDateTime(e.getText()));
}
e = eEntry.getChild("summary", getAtomNamespace());
if (e != null)
{
entry.setSummary(parseContent(e));
}
e = eEntry.getChild("content", getAtomNamespace());
if (e != null)
{
List<Content> contents = new ArrayList<Content>();
contents.add(parseContent(e));
entry.setContents(contents);
}
e = eEntry.getChild("rights", getAtomNamespace());
if (e != null)
{
entry.setRights(e.getText());
}
List<Element> cList = getChildren(eEntry, "category");
entry.setCategories(parseCategories(baseURI, cList));
// TODO: SHOULD handle Atom entry source element
entry.setModules(parseItemModules(eEntry));
List foreignMarkup =
extractForeignMarkup(eEntry, entry, getAtomNamespace());
if (foreignMarkup.size() > 0)
{
entry.setForeignMarkup(foreignMarkup);
}
return entry;
}
private List<Category> parseCategories(String baseURI, List<Element> eCategories)
{
List<Category> cats = new ArrayList<Category>();
for (Element eCategory : eCategories) cats.add(parseCategory(baseURI, eCategory));
return (cats.size() > 0) ? cats : null;
}
private Category parseCategory(String baseURI, Element eCategory)
{
Category category = new Category();
String att = eCategory.getAttributeValue("term");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
category.setTerm(att);
}
att = eCategory.getAttributeValue("scheme");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
category.setScheme(resolveURI(baseURI, eCategory, att));
}
att = eCategory.getAttributeValue("label");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
if (att != null)
{
category.setLabel(att);
}
return category;
}
// Fix for issue #34 "valid IRI href attributes are stripped for atom:link"
// URI's that didn't start with http were being treated as relative URIs.
// So now consider an absolute URI to be any alpha-numeric string followed
// by a colon, followed by anything -- specified by this regex:
static Pattern absoluteURIPattern = Pattern.compile("^[a-z0-9]*:.*$");
private boolean isAbsoluteURI(String uri)
{
return absoluteURIPattern.matcher(uri).find();
}
private boolean isRelativeURI(String uri)
{
return !isAbsoluteURI(uri);
}
/**
* Resolve URI based considering xml:base and baseURI.
*
* @param baseURI Base URI of feed
* @param parent Parent from which to consider xml:base
* @param url URL to be resolved
*
* @return URI.
*/
private String resolveURI(String baseURI, Parent parent, String url)
{
if (isRelativeURI(url))
{
url = (!".".equals(url) && !"./".equals(url)) ? url : "";
// Relative URI with parent
if (parent != null && parent instanceof Element)
{
// Do we have an xml:base?
String xmlbase = ((Element)parent).getAttributeValue(
"base", Namespace.XML_NAMESPACE);
if (xmlbase != null && xmlbase.trim().length() > 0)
{
if (isAbsoluteURI(xmlbase))
{
// Absolute xml:base, so form URI right now
if (url.startsWith("/"))
{
// Host relative URI
int slashslash = xmlbase.indexOf("//");
int nextslash = xmlbase.indexOf("/", slashslash + 2);
if (nextslash != -1) xmlbase = xmlbase.substring(0, nextslash);
return formURI(xmlbase, url);
}
if (!xmlbase.endsWith("/"))
{
// Base URI is filename, strip it off
xmlbase = xmlbase.substring(0, xmlbase.lastIndexOf("/") + 1);
}
return formURI(xmlbase, url);
} else
{
// Relative xml:base, so walk up tree
return resolveURI(baseURI, parent.getParent(),
stripTrailingSlash(xmlbase) + "/" + stripStartingSlash(url));
}
}
// No xml:base so walk up tree
return resolveURI(baseURI, parent.getParent(), url);
// Relative URI with no parent (i.e. top of tree), so form URI right now
} else if (parent == null || parent instanceof Document)
{
return formURI(baseURI, url);
}
}
return url;
}
/**
* Find base URI of feed considering relative URIs.
*
* @param root Root element of feed.
*
* @return base URI.
*
* @throws java.net.MalformedURLException if error.
*/
private String findBaseURI(Element root) throws MalformedURLException
{
String ret = findAtomLink(root, "alternate");
if (ret != null && isRelativeURI(ret))
{
String self = findAtomLink(root, "self");
if (self != null)
{
self = resolveURI(null, root, self);
ret = resolveURI(self, root, ret);
}
}
return ret;
}
/**
* Return URL string of Atom link element under parent element.
* Link with no rel attribute is considered to be rel="alternate"
*
* @param parent Consider only children of this parent element
* @param rel Consider only links with this relationship
*
* @return atom link.
*/
private String findAtomLink(Element parent, String rel)
{
String ret = null;
List<Element> linksList = (List<Element>)parent.getChildren("link", ns);
if (linksList != null)
{
for (Element link : linksList)
{
Attribute relAtt = link.getAttribute("rel");
Attribute hrefAtt = link.getAttribute("href");
if ((relAtt == null && "alternate".equals(rel))
|| (relAtt != null && relAtt.getValue().equals(rel)))
{
ret = hrefAtt.getValue();
break;
}
}
}
return ret;
}
/**
* Strip starting slash from beginning of string.
*/
private static String stripStartingSlash(String s)
{
if (s != null && s.startsWith("/"))
{
s = s.substring(1, s.length());
}
return s;
}
/**
* Strip trailing slash from end of string.
*/
private static String stripTrailingSlash(String s)
{
if (s != null && s.endsWith("/"))
{
s = s.substring(0, s.length() - 1);
}
return s;
}
/**
* Resolves two URI into one.
*
* @param base base URI (optional).
* @param child child URI (optional).
*
* @return resulting URI or <code>NULL</code> if errors out.
*/
static String formURI(String base, String child)
{
if (base == null) return child;
if (child == null) return base;
try
{
return new URI(base).resolve(child).toString();
} catch (URISyntaxException e)
{
return null;
}
}
}