/*
* Copyright 2004 Sun Microsystems, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.rometools.rome.io.impl;
import java.io.IOException;
import java.io.Reader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.jdom2.Attribute;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.Namespace;
import org.jdom2.Parent;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.XMLOutputter;
import com.rometools.rome.feed.WireFeed;
import com.rometools.rome.feed.atom.Category;
import com.rometools.rome.feed.atom.Content;
import com.rometools.rome.feed.atom.Entry;
import com.rometools.rome.feed.atom.Feed;
import com.rometools.rome.feed.atom.Generator;
import com.rometools.rome.feed.atom.Link;
import com.rometools.rome.feed.atom.Person;
import com.rometools.rome.feed.synd.SyndPerson;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.WireFeedInput;
import com.rometools.rome.io.WireFeedOutput;
import com.rometools.utils.Lists;
/**
* Parser for Atom 1.0
*
* @author Dave Johnson
*/
public class Atom10Parser extends BaseWireFeedParser {
/** Namespace URI that identifies Atom 1.0 elements. */
private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
/** JDOM namespace object created from {@link #ATOM_10_URI}. */
private static final Namespace ATOM_10_NS = Namespace.getNamespace(ATOM_10_URI);
/**
 * Class-wide switch consulted by {@link #resolveURI(String, Parent, String)}: while it is
 * {@code false} (the default) that method returns the URL it was given unchanged.
 */
private static boolean resolveURIs = false;
/**
 * Turns relative URI resolution on or off. The flag is static, so the setting applies to every
 * parser instance.
 *
 * @param resolveURIs {@code true} to let {@link #resolveURI(String, Parent, String)} resolve
 *            relative URIs
 */
public static void setResolveURIs(final boolean resolveURIs) {
Atom10Parser.resolveURIs = resolveURIs;
}
/**
 * Returns the current value of the relative URI resolution flag.
 *
 * @return {@code true} if relative URIs are being resolved
 */
public static boolean getResolveURIs() {
return resolveURIs;
}
/**
 * Creates a parser with the feed type {@code "atom_1.0"}.
 */
public Atom10Parser() {
this("atom_1.0");
}
/**
 * Creates a parser for the given feed type, passing the Atom 1.0 namespace to the superclass
 * constructor.
 *
 * @param type feed type identifier handed to the superclass constructor
 */
protected Atom10Parser(final String type) {
super(type, ATOM_10_NS);
}
/**
 * Returns the namespace this parser uses when looking up Atom elements.
 *
 * @return the Atom 1.0 namespace
 */
protected Namespace getAtomNamespace() {
return ATOM_10_NS;
}
/**
 * Tells whether the document is handled by this parser, judged by the namespace of its root
 * element.
 *
 * @param document document to inspect
 * @return {@code true} when the root element's namespace equals {@link #getAtomNamespace()}
 */
@Override
public boolean isMyType(final Document document) {
    final Namespace rootNamespace = document.getRootElement().getNamespace();
    if (rootNamespace == null) {
        return false;
    }
    return rootNamespace.equals(getAtomNamespace());
}
/**
 * Parses a JDOM document into a wire feed.
 *
 * @param document document whose root element is the feed
 * @param validate when {@code true}, {@link #validateFeed(Document)} is invoked first
 * @param locale locale forwarded to the date and module parsers
 * @return the parsed feed
 * @throws FeedException if the feed cannot be parsed
 */
@Override
public WireFeed parse(final Document document, final boolean validate, final Locale locale) throws IllegalArgumentException, FeedException {
    if (validate) {
        validateFeed(document);
    }
    return parseFeed(document.getRootElement(), locale);
}
/**
 * Validation hook called by {@code parse} when validation is requested. The body is currently
 * empty, so no validation is performed.
 *
 * @param document document that would be validated
 * @throws FeedException declared for implementations that reject a document
 */
protected void validateFeed(final Document document) throws FeedException {
// TBD: the feed has to be validated here against a schema or similar; it is not clear yet how.
// One possibility would be to produce an output and attempt to parse it again with validation
// turned on. Otherwise the document elements will have to be checked by hand.
}
/**
 * Parses a feed element: metadata, stylesheet, xml:base, modules, entries and foreign markup.
 *
 * @param eFeed root element of the feed
 * @param locale locale forwarded to the date and module parsers
 * @return the populated feed
 * @throws FeedException if the base URI of the feed cannot be determined
 */
protected WireFeed parseFeed(final Element eFeed, final Locale locale) throws FeedException {
    final String baseURI;
    try {
        baseURI = findBaseURI(eFeed);
    } catch (final Exception e) {
        throw new FeedException("ERROR while finding base URI of feed", e);
    }

    final Feed feed = parseFeedMetadata(baseURI, eFeed, locale);
    feed.setStyleSheet(getStyleSheet(eFeed.getDocument()));

    final String declaredBase = eFeed.getAttributeValue("base", Namespace.XML_NAMESPACE);
    if (declaredBase != null) {
        feed.setXmlBase(declaredBase);
    }

    feed.setModules(parseFeedModules(eFeed, locale));

    // Entries are only set when the feed actually contains some
    final List<Element> entryElements = eFeed.getChildren("entry", getAtomNamespace());
    if (!entryElements.isEmpty()) {
        feed.setEntries(parseEntries(feed, baseURI, entryElements, locale));
    }

    final List<Element> foreign = extractForeignMarkup(eFeed, feed, getAtomNamespace());
    if (!foreign.isEmpty()) {
        feed.setForeignMarkup(foreign);
    }
    return feed;
}
/**
 * Builds a {@link Feed} from the metadata children of the given element: title, links,
 * categories, authors, contributors, subtitle, id, generator, rights, icon, logo and updated.
 * It is used for the feed element itself and for an entry's source element.
 *
 * @param baseURI base URI used when resolving relative URIs
 * @param eFeed element holding the metadata children
 * @param locale locale forwarded to the date and module parsers
 * @return a feed populated with the metadata that was present
 */
private Feed parseFeedMetadata(final String baseURI, final Element eFeed, final Locale locale) {
    final Namespace atom = getAtomNamespace();
    final Feed result = new Feed(getType());

    final Element eTitle = eFeed.getChild("title", atom);
    if (eTitle != null) {
        final Content titleContent = new Content();
        titleContent.setValue(parseTextConstructToString(eTitle));
        titleContent.setType(getAttributeValue(eTitle, "type"));
        result.setTitleEx(titleContent);
    }

    final List<Element> eLinks = eFeed.getChildren("link", atom);
    result.setAlternateLinks(parseAlternateLinks(result, null, baseURI, eLinks));
    result.setOtherLinks(parseOtherLinks(result, null, baseURI, eLinks));

    result.setCategories(parseCategories(baseURI, eFeed.getChildren("category", atom)));

    final List<Element> eAuthors = eFeed.getChildren("author", atom);
    if (!eAuthors.isEmpty()) {
        result.setAuthors(parsePersons(baseURI, eAuthors, locale));
    }

    final List<Element> eContributors = eFeed.getChildren("contributor", atom);
    if (!eContributors.isEmpty()) {
        result.setContributors(parsePersons(baseURI, eContributors, locale));
    }

    final Element eSubtitle = eFeed.getChild("subtitle", atom);
    if (eSubtitle != null) {
        final Content subtitleContent = new Content();
        subtitleContent.setValue(parseTextConstructToString(eSubtitle));
        subtitleContent.setType(getAttributeValue(eSubtitle, "type"));
        result.setSubtitle(subtitleContent);
    }

    final Element eId = eFeed.getChild("id", atom);
    if (eId != null) {
        result.setId(eId.getText());
    }

    final Element eGenerator = eFeed.getChild("generator", atom);
    if (eGenerator != null) {
        final Generator generatorBean = new Generator();
        generatorBean.setValue(eGenerator.getText());
        final String generatorUri = getAttributeValue(eGenerator, "uri");
        if (generatorUri != null) {
            generatorBean.setUrl(generatorUri);
        }
        final String generatorVersion = getAttributeValue(eGenerator, "version");
        if (generatorVersion != null) {
            generatorBean.setVersion(generatorVersion);
        }
        result.setGenerator(generatorBean);
    }

    final Element eRights = eFeed.getChild("rights", atom);
    if (eRights != null) {
        result.setRights(parseTextConstructToString(eRights));
    }

    final Element eIcon = eFeed.getChild("icon", atom);
    if (eIcon != null) {
        result.setIcon(eIcon.getText());
    }

    final Element eLogo = eFeed.getChild("logo", atom);
    if (eLogo != null) {
        result.setLogo(eLogo.getText());
    }

    final Element eUpdated = eFeed.getChild("updated", atom);
    if (eUpdated != null) {
        result.setUpdated(DateParser.parseDate(eUpdated.getText(), locale));
    }
    return result;
}
/**
 * Converts an atom:link element into a {@link Link}. Only attributes present on the element are
 * copied. A relative href additionally gets a resolved form, and a length that cannot be parsed
 * as a number is ignored.
 *
 * @param feed feed being parsed (not used by this method)
 * @param entry entry being parsed, may be null (not used by this method)
 * @param baseURI base URI used when resolving a relative href
 * @param eLink the link element
 * @return the populated link
 */
private Link parseLink(final Feed feed, final Entry entry, final String baseURI, final Element eLink) {
    final Link result = new Link();

    final String relValue = getAttributeValue(eLink, "rel");
    if (relValue != null) {
        result.setRel(relValue);
    }

    final String typeValue = getAttributeValue(eLink, "type");
    if (typeValue != null) {
        result.setType(typeValue);
    }

    final String hrefValue = getAttributeValue(eLink, "href");
    if (hrefValue != null) {
        result.setHref(hrefValue);
        if (isRelativeURI(hrefValue)) {
            result.setHrefResolved(resolveURI(baseURI, eLink, hrefValue));
        }
    }

    final String titleValue = getAttributeValue(eLink, "title");
    if (titleValue != null) {
        result.setTitle(titleValue);
    }

    final String hrefLangValue = getAttributeValue(eLink, "hreflang");
    if (hrefLangValue != null) {
        result.setHreflang(hrefLangValue);
    }

    final String lengthValue = getAttributeValue(eLink, "length");
    if (lengthValue != null) {
        final Long parsedLength = NumberParser.parseLong(lengthValue);
        if (parsedLength != null) {
            result.setLength(parsedLength.longValue());
        }
    }
    return result;
}
// List(Elements) -> List(Link)
/**
 * Parses the link elements and keeps those that are alternate links, i.e. whose rel is missing,
 * blank or {@code "alternate"}.
 *
 * @param feed feed being parsed
 * @param entry entry being parsed, or null when parsing feed-level links
 * @param baseURI base URI used when resolving relative hrefs
 * @param eLinks link elements to parse
 * @return the alternate links, or null if there are none
 */
private List<Link> parseAlternateLinks(final Feed feed, final Entry entry, final String baseURI, final List<Element> eLinks) {
    final List<Link> links = new ArrayList<Link>();
    for (final Element eLink : eLinks) {
        final Link link = parseLink(feed, entry, baseURI, eLink);
        if (isAlternateRel(link.getRel())) {
            links.add(link);
        }
    }
    return Lists.emptyToNull(links);
}
/**
 * Parses the link elements and keeps those that are not alternate links. This is the exact
 * complement of {@link #parseAlternateLinks}, so a link never ends up in both lists.
 *
 * @param feed feed being parsed
 * @param entry entry being parsed, or null when parsing feed-level links
 * @param baseURI base URI used when resolving relative hrefs
 * @param eLinks link elements to parse
 * @return the non-alternate links, or null if there are none
 */
private List<Link> parseOtherLinks(final Feed feed, final Entry entry, final String baseURI, final List<Element> eLinks) {
    final List<Link> links = new ArrayList<Link>();
    for (final Element eLink : eLinks) {
        final Link link = parseLink(feed, entry, baseURI, eLink);
        if (!isAlternateRel(link.getRel())) {
            links.add(link);
        }
    }
    return Lists.emptyToNull(links);
}
/**
 * Single definition of "alternate" shared by both link filters: a missing or blank rel counts as
 * alternate, as does the literal value {@code "alternate"}.
 */
private static boolean isAlternateRel(final String rel) {
    return rel == null || "".equals(rel.trim()) || "alternate".equals(rel);
}
/**
 * Converts an Atom person construct (author or contributor) into a {@link Person}.
 *
 * @param baseURI base URI used when resolving a relative person URI
 * @param ePerson the person element
 * @param locale locale forwarded to the module parsers
 * @return the populated person
 */
private Person parsePerson(final String baseURI, final Element ePerson, final Locale locale) {
    final Namespace atom = getAtomNamespace();
    final Person result = new Person();

    final Element eName = ePerson.getChild("name", atom);
    if (eName != null) {
        result.setName(eName.getText());
    }

    final Element eUri = ePerson.getChild("uri", atom);
    if (eUri != null) {
        final String uriText = eUri.getText();
        result.setUri(uriText);
        if (isRelativeURI(uriText)) {
            result.setUriResolved(resolveURI(baseURI, ePerson, uriText));
        }
    }

    final Element eEmail = ePerson.getChild("email", atom);
    if (eEmail != null) {
        result.setEmail(eEmail.getText());
    }

    result.setModules(parsePersonModules(ePerson, locale));
    return result;
}
// List(Elements) -> List(Persons)
/**
 * Converts each person element with {@code parsePerson}.
 *
 * @param baseURI base URI used when resolving relative person URIs
 * @param ePersons person elements to convert
 * @param locale locale forwarded to the module parsers
 * @return the converted persons in document order, or null if the list is empty
 */
private List<SyndPerson> parsePersons(final String baseURI, final List<Element> ePersons, final Locale locale) {
    final List<SyndPerson> result = new ArrayList<SyndPerson>(ePersons.size());
    for (final Element personElement : ePersons) {
        final Person parsed = parsePerson(baseURI, personElement, locale);
        result.add(parsed);
    }
    return Lists.emptyToNull(result);
}
/**
 * Converts a content-bearing element (summary or content) into a {@link Content} carrying its
 * src attribute, type attribute and value.
 *
 * @param e the element to convert
 * @return the populated content
 */
private Content parseContent(final Element e) {
    final Content result = new Content();
    result.setValue(parseTextConstructToString(e));
    result.setSrc(getAttributeValue(e, "src"));
    result.setType(getAttributeValue(e, "type"));
    return result;
}
/**
 * Returns the value of a text construct as a string. When the type is XHTML or names an XML
 * media type ("/xml" or "+xml"), the element's content is serialized as markup; otherwise the
 * element's text is returned verbatim. A missing type is treated as plain text.
 * <p>
 * Side effect: in the markup case, direct child elements in the Atom namespace are moved to no
 * namespace before being serialized.
 *
 * @param e the text construct element
 * @return the text or serialized markup
 */
private String parseTextConstructToString(final Element e) {
    final String declaredType = getAttributeValue(e, "type");
    final String type = declaredType != null ? declaredType : Content.TEXT;

    final boolean isMarkup = type.equals(Content.XHTML) || type.contains("/xml") || type.contains("+xml");
    if (!isMarkup) {
        // Everything else comes in verbatim
        return e.getText();
    }

    // Markup needs special handling: drop the Atom namespace from direct children, then serialize
    final Namespace atom = getAtomNamespace();
    final List<org.jdom2.Content> children = e.getContent();
    for (final org.jdom2.Content child : children) {
        if (!(child instanceof Element)) {
            continue;
        }
        final Element childElement = (Element) child;
        if (childElement.getNamespace().equals(atom)) {
            childElement.setNamespace(Namespace.NO_NAMESPACE);
        }
    }
    return new XMLOutputter().outputString(children);
}
// List(Elements) -> List(Entries)
/**
 * Converts each entry element with {@link #parseEntry(Feed, Element, String, Locale)}.
 *
 * @param feed feed the entries belong to
 * @param baseURI base URI used when resolving relative URIs
 * @param eEntries entry elements to convert
 * @param locale locale forwarded to the date and module parsers
 * @return the converted entries in document order, or null if the list is empty
 */
protected List<Entry> parseEntries(final Feed feed, final String baseURI, final List<Element> eEntries, final Locale locale) {
    final List<Entry> result = new ArrayList<Entry>(eEntries.size());
    for (final Element entryElement : eEntries) {
        final Entry parsed = this.parseEntry(feed, entryElement, baseURI, locale);
        result.add(parsed);
    }
    return Lists.emptyToNull(result);
}
/**
 * Converts an atom:entry element into an {@link Entry}: xml:base, title, links, authors,
 * contributors, id, updated, published, summary, content, rights, categories, source, modules
 * and foreign markup.
 *
 * @param feed feed the entry belongs to
 * @param eEntry the entry element
 * @param baseURI base URI used when resolving relative URIs
 * @param locale locale forwarded to the date and module parsers
 * @return the populated entry
 */
protected Entry parseEntry(final Feed feed, final Element eEntry, final String baseURI, final Locale locale) {
    final Entry entry = new Entry();
    final String xmlBase = eEntry.getAttributeValue("base", Namespace.XML_NAMESPACE);
    if (xmlBase != null) {
        entry.setXmlBase(xmlBase);
    }
    final Element title = eEntry.getChild("title", getAtomNamespace());
    if (title != null) {
        final Content c = new Content();
        c.setValue(parseTextConstructToString(title));
        c.setType(getAttributeValue(title, "type"));
        entry.setTitleEx(c);
    }
    final List<Element> links = eEntry.getChildren("link", getAtomNamespace());
    entry.setAlternateLinks(parseAlternateLinks(feed, entry, baseURI, links));
    entry.setOtherLinks(parseOtherLinks(feed, entry, baseURI, links));
    final List<Element> authors = eEntry.getChildren("author", getAtomNamespace());
    if (!authors.isEmpty()) {
        entry.setAuthors(parsePersons(baseURI, authors, locale));
    }
    final List<Element> contributors = eEntry.getChildren("contributor", getAtomNamespace());
    if (!contributors.isEmpty()) {
        entry.setContributors(parsePersons(baseURI, contributors, locale));
    }
    final Element id = eEntry.getChild("id", getAtomNamespace());
    if (id != null) {
        entry.setId(id.getText());
    }
    final Element updated = eEntry.getChild("updated", getAtomNamespace());
    if (updated != null) {
        entry.setUpdated(DateParser.parseDate(updated.getText(), locale));
    }
    final Element published = eEntry.getChild("published", getAtomNamespace());
    if (published != null) {
        entry.setPublished(DateParser.parseDate(published.getText(), locale));
    }
    final Element summary = eEntry.getChild("summary", getAtomNamespace());
    if (summary != null) {
        entry.setSummary(parseContent(summary));
    }
    final Element content = eEntry.getChild("content", getAtomNamespace());
    if (content != null) {
        final List<Content> contents = new ArrayList<Content>();
        contents.add(parseContent(content));
        entry.setContents(contents);
    }
    final Element rights = eEntry.getChild("rights", getAtomNamespace());
    if (rights != null) {
        // atom:rights is a text construct, so it is read the same way as the feed-level rights;
        // a plain getText() would drop XHTML rights markup
        entry.setRights(parseTextConstructToString(rights));
    }
    final List<Element> categories = eEntry.getChildren("category", getAtomNamespace());
    entry.setCategories(parseCategories(baseURI, categories));
    // atom:source carries metadata of the feed the entry was copied from
    final Element source = eEntry.getChild("source", getAtomNamespace());
    if (source != null) {
        entry.setSource(parseFeedMetadata(baseURI, source, locale));
    }
    entry.setModules(parseItemModules(eEntry, locale));
    final List<Element> foreignMarkup = extractForeignMarkup(eEntry, entry, getAtomNamespace());
    if (!foreignMarkup.isEmpty()) {
        entry.setForeignMarkup(foreignMarkup);
    }
    return entry;
}
/**
 * Converts each category element with {@code parseCategory}.
 *
 * @param baseURI base URI used when resolving relative scheme URIs
 * @param eCategories category elements to convert
 * @return the converted categories in document order, or null if the list is empty
 */
private List<Category> parseCategories(final String baseURI, final List<Element> eCategories) {
    final List<Category> result = new ArrayList<Category>(eCategories.size());
    for (final Element categoryElement : eCategories) {
        final Category parsed = parseCategory(baseURI, categoryElement);
        result.add(parsed);
    }
    return Lists.emptyToNull(result);
}
/**
 * Converts an atom:category element into a {@link Category}. Only attributes present on the
 * element are copied; a relative scheme additionally gets a resolved form.
 *
 * @param baseURI base URI used when resolving a relative scheme
 * @param eCategory the category element
 * @return the populated category
 */
private Category parseCategory(final String baseURI, final Element eCategory) {
    final Category result = new Category();

    final String termValue = getAttributeValue(eCategory, "term");
    if (termValue != null) {
        result.setTerm(termValue);
    }

    final String schemeValue = getAttributeValue(eCategory, "scheme");
    if (schemeValue != null) {
        result.setScheme(schemeValue);
        if (isRelativeURI(schemeValue)) {
            result.setSchemeResolved(resolveURI(baseURI, eCategory, schemeValue));
        }
    }

    final String labelValue = getAttributeValue(eCategory, "label");
    if (labelValue != null) {
        result.setLabel(labelValue);
    }
    return result;
}
// Fix for issue #34 "valid IRI href attributes are stripped for atom:link":
// URIs that did not start with http used to be treated as relative URIs.
// A URI is considered absolute when it starts with a scheme followed by a
// colon. Following RFC 3986 section 3.1 a scheme is a letter followed by any
// number of letters, digits, "+", "-" or "."; schemes are case-insensitive,
// so upper-case spellings such as "HTTP:" are accepted as well.
static Pattern absoluteURIPattern = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+.\\-]*:.*$");
/**
 * Returns true if the URI is absolute, i.e. starts with a scheme and a colon.
 *
 * @param uri the URI to test, must not be null
 * @return {@code true} if the URI starts with a scheme
 */
public static boolean isAbsoluteURI(final String uri) {
    return absoluteURIPattern.matcher(uri).find();
}
/**
 * Returns true if the URI is relative, i.e. is not absolute.
 *
 * @param uri the URI to test, must not be null
 * @return {@code true} if the URI does not start with a scheme
 */
public static boolean isRelativeURI(final String uri) {
    return !isAbsoluteURI(uri);
}
/**
 * Resolves a URL against xml:base attributes found on the parent chain and against the base URI
 * of the document. Nothing is resolved, and the URL is returned unchanged, unless resolution has
 * been switched on with {@link #setResolveURIs(boolean)}. Absolute URLs are always returned
 * unchanged.
 * <p>
 * A relative URL that cannot be resolved because neither an absolute xml:base nor a base URI is
 * available is returned as is.
 *
 * @param baseURI Base URI used to fetch the XML document, may be null
 * @param parent Parent element from which to consider xml:base, may be null
 * @param url URL to be resolved
 * @return the resolved URL, or the URL itself when no resolution takes place
 */
public static String resolveURI(final String baseURI, final Parent parent, String url) {
    if (!resolveURIs) {
        return url;
    }
    if (isRelativeURI(url)) {
        if (".".equals(url) || "./".equals(url)) {
            url = "";
        }
        if (url.startsWith("/") && baseURI != null) {
            // Host relative URI: only the scheme and host of the base URI are kept
            return formURI(schemeAndHost(baseURI), url);
        }
        // Relative URI with parent
        if (parent != null && parent instanceof Element) {
            // Do we have an xml:base?
            String xmlbase = ((Element) parent).getAttributeValue("base", Namespace.XML_NAMESPACE);
            if (xmlbase != null && xmlbase.trim().length() > 0) {
                if (isAbsoluteURI(xmlbase)) {
                    // Absolute xml:base, so form URI right now
                    if (url.startsWith("/")) {
                        // Host relative URI
                        return formURI(schemeAndHost(xmlbase), url);
                    }
                    if (!xmlbase.endsWith("/")) {
                        // Base URI ends in a filename, strip it off. The slashes of the
                        // "scheme://" separator do not delimit a filename, so a base such as
                        // "http://host" is kept whole.
                        final int lastSlash = xmlbase.lastIndexOf('/');
                        if (lastSlash > xmlbase.indexOf("//") + 1) {
                            xmlbase = xmlbase.substring(0, lastSlash);
                        }
                    }
                    return formURI(xmlbase, url);
                } else {
                    // Relative xml:base, so walk up tree
                    return resolveURI(baseURI, parent.getParent(), stripTrailingSlash(xmlbase) + "/" + stripStartingSlash(url));
                }
            }
            // No xml:base so walk up tree
            return resolveURI(baseURI, parent.getParent(), url);
            // Relative URI with no parent (i.e. top of tree), so form URI
            // right now
        } else if (parent == null || parent instanceof Document) {
            // Without a base URI there is nothing to resolve against
            return baseURI != null ? formURI(baseURI, url) : url;
        }
    }
    return url;
}
/**
 * Cuts a URI down to everything before the first slash that follows the "//" separator. A URI
 * without such a slash (no path) is returned unchanged.
 */
private static String schemeAndHost(final String uri) {
    final int slashslash = uri.indexOf("//");
    final int nextslash = uri.indexOf("/", slashslash + 2);
    return nextslash != -1 ? uri.substring(0, nextslash) : uri;
}
/**
 * Find base URI of feed considering relative URIs. The base is derived from the feed's
 * rel="self" link: its last path segment is dropped and the remainder is resolved against any
 * xml:base of the root element.
 *
 * @param root Root element of feed.
 * @return the base URI, or null if the feed has no self link
 * @throws MalformedURLException declared for callers; not thrown by this implementation
 */
private String findBaseURI(final Element root) throws MalformedURLException {
    String ret = findAtomLink(root, "self");
    if (ret == null) {
        return null;
    }
    if (".".equals(ret) || "./".equals(ret)) {
        ret = "";
    }
    // Drop the last path segment. The slashes of the "scheme://" separator are not path
    // separators, so a self link without a path ("http://host") is kept whole.
    final int lastSlash = ret.lastIndexOf('/');
    final int authority = ret.indexOf("//");
    if (lastSlash != -1 && (authority == -1 || lastSlash > authority + 1)) {
        ret = ret.substring(0, lastSlash);
    }
    return resolveURI(null, root, ret);
}
/**
 * Return URL string of Atom link element under parent element. Link with no rel attribute is
 * considered to be rel="alternate". A matching link without an href attribute is skipped.
 *
 * @param parent Consider only children of this parent element
 * @param rel Consider only links with this relationship
 * @return href of the first matching link that has one, or null if there is none
 */
private String findAtomLink(final Element parent, final String rel) {
    final List<Element> linksList = parent.getChildren("link", ATOM_10_NS);
    if (linksList == null) {
        return null;
    }
    for (final Element link : linksList) {
        final Attribute relAtt = getAttribute(link, "rel");
        final boolean relMatches = relAtt == null ? "alternate".equals(rel) : relAtt.getValue().equals(rel);
        if (!relMatches) {
            continue;
        }
        final Attribute hrefAtt = getAttribute(link, "href");
        if (hrefAtt != null) {
            return hrefAtt.getValue();
        }
    }
    return null;
}
/**
 * Form URI by combining base with append portion and giving special consideration to append
 * portions that begin with "..": every leading ".." segment removes the last path segment of
 * the base. A ".." that would climb above the host of a "scheme://host" base is discarded
 * instead of eating into the base. Only leading ".." segments are interpreted.
 *
 * @param base Base of URI, may end with trailing slash; if null, append is returned unchanged
 * @param append String to append, may begin with slash or ".."
 * @return base and append joined by a single slash
 */
private static String formURI(String base, String append) {
    if (base == null) {
        // Nothing to combine with
        return append;
    }
    base = stripTrailingSlash(base);
    append = stripStartingSlash(append);
    final int authority = base.indexOf("//");
    while ("..".equals(append) || append.startsWith("../")) {
        final int last = base.lastIndexOf('/');
        if (last == -1) {
            break;
        }
        // Only cut the base while the last slash is a real path separator and not part of
        // the "//" that introduces the host
        if (authority == -1 || last > authority + 1) {
            base = base.substring(0, last);
        }
        // Consume the ".." segment together with its slash, if it has one
        append = append.length() > 3 ? append.substring(3) : "";
    }
    return base + "/" + append;
}
/**
 * Removes one leading slash, if present.
 *
 * @param s string to trim, may be null
 * @return the string without its first character when that character is a slash; otherwise the
 *         argument itself (null stays null)
 */
private static String stripStartingSlash(String s) {
    if (s == null || !s.startsWith("/")) {
        return s;
    }
    return s.substring(1);
}
/**
 * Removes one trailing slash, if present.
 *
 * @param s string to trim, may be null
 * @return the string without its last character when that character is a slash; otherwise the
 *         argument itself (null stays null)
 */
private static String stripTrailingSlash(String s) {
    if (s == null || !s.endsWith("/")) {
        return s;
    }
    return s.substring(0, s.length() - 1);
}
/**
 * Parse entry from reader. The entry document is read into a JDOM tree, wrapped in an empty
 * Atom 1.0 feed and run through the regular feed parser.
 *
 * @param rd reader supplying the XML of a single atom:entry
 * @param baseURI value for the xml:base attribute of the wrapping feed, may be null
 * @param locale locale forwarded to the feed parser
 * @return the parsed entry
 * @throws JDOMException if the XML is not well-formed
 * @throws IOException if reading fails
 * @throws FeedException if the feed cannot be built or the document holds no Atom entry
 */
public static Entry parseEntry(final Reader rd, final String baseURI, final Locale locale) throws JDOMException, IOException, IllegalArgumentException,
        FeedException {
    // Parse entry into JDOM tree
    final SAXBuilder builder = new SAXBuilder();
    // The reader may supply untrusted XML: entity references are left unexpanded so that
    // external entities cannot be used to read local files or reach other hosts (XXE)
    builder.setExpandEntities(false);
    final Document entryDoc = builder.build(rd);
    final Element fetchedEntryElement = entryDoc.getRootElement();
    fetchedEntryElement.detach();
    // Put entry into a JDOM document with 'feed' root so that Rome can
    // handle it
    final Feed feed = new Feed();
    feed.setFeedType("atom_1.0");
    final WireFeedOutput wireFeedOutput = new WireFeedOutput();
    final Document feedDoc = wireFeedOutput.outputJDom(feed);
    feedDoc.getRootElement().addContent(fetchedEntryElement);
    if (baseURI != null) {
        feedDoc.getRootElement().setAttribute("base", baseURI, Namespace.XML_NAMESPACE);
    }
    final WireFeedInput input = new WireFeedInput(false, locale);
    final Feed parsedFeed = (Feed) input.build(feedDoc);
    final List<Entry> entries = parsedFeed.getEntries();
    if (entries == null || entries.isEmpty()) {
        // The root element was not an Atom 1.0 entry, so the feed parser found nothing
        throw new FeedException("No Atom 1.0 entry element found in the supplied document");
    }
    return entries.get(0);
}
}