// Source Code of de.nava.informa.parsers.Atom_1_0_Parser

// Informa -- RSS Library for Java
// Copyright (c) 2002 by Niko Schmuck
//
// Niko Schmuck
// http://sourceforge.net/projects/informa
// mailto:niko_schmuck@users.sourceforge.net
//
// This library is free software.
//
// You may redistribute it and/or modify it under the terms of the GNU
// Lesser General Public License as published by the Free Software Foundation.
//
// Version 2.1 of the license should be included with this distribution in
// the file LICENSE. If the license is not included with this distribution,
// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
// MA 02139 USA.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//


// $Id: Atom_1_0_Parser.java,v 1.10 2007/01/06 21:33:45 niko_schmuck Exp $
package de.nava.informa.parsers;


import java.net.URL;
import java.util.Date;
import java.util.Iterator;
import java.util.List;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom.Element;
import org.jdom.Namespace;


import de.nava.informa.core.ChannelBuilderIF;
import de.nava.informa.core.ChannelFormat;
import de.nava.informa.core.ChannelIF;
import de.nava.informa.core.ChannelParserIF;
import de.nava.informa.core.ItemIF;
import de.nava.informa.core.ParseException;
import de.nava.informa.utils.AtomParserUtils;
import de.nava.informa.utils.ParserUtils;




/**
 * Parser which reads in document instances according to the Atom 1.0
 * specification and generates a news channel object. Currently the support for
 * the atom syntax is not complete.
 *
 * This parser is based on code of Atom 0.3 parser. Some good sources of
 * information regarding Atom 1.0 are:
 * http://www.atomenabled.org/developers/syndication/atom-format-spec.php
 * http://rakaz.nl/item/moving_from_atom_03_to_10
 * http://www.atomenabled.org/developers/syndication/
 * http://www-128.ibm.com/developerworks/xml/library/x-atom10.html
 *
 * @author Nilesh Bansal
 */
public class Atom_1_0_Parser implements ChannelParserIF {
  private static final Log LOGGER = LogFactory.getLog(Atom_1_0_Parser.class);


  /**
   * Private constructor suppresses generation of a (public) default
   * constructor, so the only instance is the one created by
   * Atom_1_0_ParserHolder and returned from getInstance().
   */
  private Atom_1_0_Parser() {}


  /**
   * Holder of the Atom_1_0_Parser instance.
   */
  private static class Atom_1_0_ParserHolder {
    private static Atom_1_0_Parser instance = new Atom_1_0_Parser();
  } 


  /**
   * Get the Atom_1_0_Parser instance.
   *
   * @return the parser instance kept in Atom_1_0_ParserHolder.
   */
  public static Atom_1_0_Parser getInstance() {
    return Atom_1_0_ParserHolder.instance;
  }
  
  /**
   * a semicolon separated list of authors
   */
  static String getAuthorString(List authors, Namespace defNS) {
    String author = "";
    Iterator authorsIt = authors.iterator();


    while (authorsIt.hasNext()) {
      Element authorElt = (Element) authorsIt.next();


      if (authorElt != null) {
        //TODO author may have more information like uri and email
        ParserUtils.matchCaseOfChildren(authorElt, "name");


        if (!"".equals(author)) {
          // if more than one author, a ; separated list
          author += "; ";
        }


        author += authorElt.getChildTextTrim("name", defNS);
      }
    }


    return author;
  }


  /** Returns the content type of element. Default is 'text'. */
  private static String getContentType(Element elt) {
    String type = elt.getAttributeValue("type");


    return (type == null) ? "text" : type;
  }


  /**
   * Looks for "content" elements and takes first from them or looks for
   * "summary" element if "content" not found.
   *
   * @param item item element.
   * @param namespace namespace.
   *
   * @return description for item.
   */
  public static String getDescription(Element item, Namespace namespace) {
    String strDesc = "";
    Element elDesc;


    // TODO there should be some way of knowing if we are returning summary or
    // content
    List contents = item.getChildren("content", namespace);


    if (contents.size() > 0) {
      elDesc = (Element) contents.get(0);
    } else {
      elDesc = item.getChild("summary", namespace);
    }


    if (elDesc != null) {
      strDesc = AtomParserUtils.getValue(elDesc, getMode(elDesc));
    }


    return strDesc;
  }


  /**
   * returns mode of a element based on its mime type
   */
  static String getMode(Element elt) {
    return getMode(getContentType(elt));
  }


  static String getMode(String type) {
    if (type == null) {
      // if type is not specified, text is to be assumed
      return "escaped";
    }


    if ("text".equals(type) || "html".equals(type)) {
      return "escaped";
    } else if ("xhtml".equals(type)) {
      return "xml";
    } else if (type.substring(type.length() - "xml".length()).equals("xml")) {
      return "xml";
    } else {
      return "base64";
    }
  }


  /**
   * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom.Element)
   */
  public ChannelIF parse(ChannelBuilderIF cBuilder, Element channel)
      throws ParseException {
    if (cBuilder == null) {
      throw new RuntimeException("Without builder no channel can "
          + "be created.");
    }


    Date dateParsed = new Date();
    Namespace defNS = ParserUtils.getDefaultNS(channel);


    if (defNS == null) {
      defNS = Namespace.NO_NAMESPACE;
      LOGGER.info("No default namespace found.");
    } else if ((defNS.getURI() == null) ||
               !defNS.getURI().equals("http://www.w3.org/2005/Atom")) {
      LOGGER.warn("Namespace is not really supported, still trying assuming Atom 1.0 format");
    }


    LOGGER.debug("start parsing.");


    // --- read in channel information


    // Lower the case of these tags to simulate case-insensitive parsing
    ParserUtils.matchCaseOfChildren(channel,
                                    new String[] {
                                      "title", "subtitle", "updated",
                                      "published", "author", "generator",
                                      "rights", "link", "entry"
                                    });


    // TODO icon and logo: Feed element can have upto 1 logo and icon.
    // TODO id: Feed and all entries have a unique id string. This can
    // be the URL of the website. Supporting this will require API change.
    // TODO: Feed can optionally have category information


    // title element
    ChannelIF chnl = cBuilder.createChannel(channel,
                                            channel.getChildTextTrim("title",
                                                                     defNS));


    chnl.setFormat(ChannelFormat.ATOM_1_0);


    // description element
    if (channel.getChild("subtitle") != null) {
      chnl.setDescription(channel.getChildTextTrim("subtitle", defNS));
    }


    // TODO: should we use summary element?


    // lastbuild element : updated ?
    Element updated = channel.getChild("updated", defNS);


    if (updated != null) {
      chnl.setPubDate(ParserUtils.getDate(updated.getTextTrim()));
    }


    // author element
    List authors = channel.getChildren("author", defNS);


    chnl.setCreator(getAuthorString(authors, defNS));


    // TODO we are ignoring contributors information


    // generator element
    Element generator = channel.getChild("generator", defNS);


    if (generator != null) {
      chnl.setGenerator(generator.getTextTrim());
    }


    // TODO generator can have URI and version information


    // copyright element
    Element rights = channel.getChild("rights", defNS);


    if (rights != null) {
      chnl.setCopyright(AtomParserUtils.getValue(rights, getMode(rights)));
    }


    List links = channel.getChildren("link", defNS);
    Iterator i = links.iterator();


    URL linkUrl = null;


    while (i.hasNext()) {
      Element linkElement = (Element) i.next();


      // use first 'alternate' link
      // if rel is not present, use first link without rel
      String rel = linkElement.getAttributeValue("rel");
      String href = linkElement.getAttributeValue("href");


      // TODO we need to handle relative links also
      if ((rel == null) && (href != null) && (linkUrl == null)) {
        linkUrl = ParserUtils.getURL(href);
      } else if ((rel != null) && (href != null) && rel.equals("alternate")) {
        linkUrl = ParserUtils.getURL(href);


        break;
      }
    }


    if (linkUrl != null) {
      chnl.setSite(linkUrl);
    }


    List items = channel.getChildren("entry", defNS);


    i = items.iterator();


    while (i.hasNext()) {
      Element item = (Element) i.next();


      // Lower the case of these tags to simulate case-insensitive parsing
      ParserUtils.matchCaseOfChildren(item,
                                      new String[] {
                                        "title", "link", "content", "summary",
                                        "published", "author"
                                      });


      // TODO entry, if copied from some other feed, may have source element
      // TODO each entry can have its own rights declaration


      // get title element
      Element elTitle = item.getChild("title", defNS);
      String strTitle = "<No Title>";


      if (elTitle != null) {
        strTitle = AtomParserUtils.getValue(elTitle, getMode(elTitle));
        LOGGER.debug("Parsing title " + elTitle.getTextTrim() + "->" +
                     strTitle);
      }


      if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("Entry element found (" + strTitle + ").");
      }


      // get link element
      String strLink = AtomParserUtils.getItemLink(item, defNS);


      // get description element
      String strDesc = getDescription(item, defNS);


      // generate new news item (link to article)
      ItemIF curItem = cBuilder.createItem(item, chnl, strTitle, strDesc,
                                           ParserUtils.getURL(strLink));


      //TODO enclosure data
      curItem.setFound(dateParsed);


      List itemAuthors = item.getChildren("author", defNS);


      curItem.setCreator(getAuthorString(itemAuthors, defNS));


      // get published element
      Element elIssued = item.getChild("published", defNS);


      if (elIssued == null) {
        // published element may not be present (but updated should be)
        Element elUpdated = item.getChild("updated", defNS);


        // TODO there should be some way to determining which one are we
        // returning
        if (elUpdated != null) {
          curItem.setDate(ParserUtils.getDate(elUpdated.getTextTrim()));
        }
      } else {
        curItem.setDate(ParserUtils.getDate(elIssued.getTextTrim()));
      }


      // get category element
      Element elCategory = item.getChild("category", defNS);


      if (elCategory != null) {
        // TODO: multiple category elements may be present
        curItem.setSubject(elCategory.getTextTrim());
      }
    }


    // set to current date
    chnl.setLastUpdated(dateParsed);


    return chnl;
  }
}
// Source Code of de.nava.informa.parsers.Atom_1_0_Parser

// Related Classes of de.nava.informa.parsers.Atom_1_0_Parser