Package de.nava.informa.parsers

Source Code of de.nava.informa.parsers.RSS_2_0_Parser

//
// Informa -- RSS Library for Java
// Copyright (c) 2002 by Niko Schmuck
//
// Niko Schmuck
// http://sourceforge.net/projects/informa
// mailto:niko_schmuck@users.sourceforge.net
//
// This library is free software.
//
// You may redistribute it and/or modify it under the terms of the GNU
// Lesser General Public License as published by the Free Software Foundation.
//
// Version 2.1 of the license should be included with this distribution in
// the file LICENSE. If the license is not included with this distribution,
// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
// MA 02139 USA.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied waranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//

package de.nava.informa.parsers;

import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom.Attribute;
import org.jdom.Element;
import org.jdom.Namespace;

import de.nava.informa.core.CategoryIF;
import de.nava.informa.core.ChannelBuilderIF;
import de.nava.informa.core.ChannelFormat;
import de.nava.informa.core.ChannelIF;
import de.nava.informa.core.ChannelParserIF;
import de.nava.informa.core.ImageIF;
import de.nava.informa.core.ItemEnclosureIF;
import de.nava.informa.core.ItemGuidIF;
import de.nava.informa.core.ItemIF;
import de.nava.informa.core.ItemSourceIF;
import de.nava.informa.core.ParseException;
import de.nava.informa.core.TextInputIF;
import de.nava.informa.impl.basic.ChannelBuilder;
import de.nava.informa.utils.ParserUtils;

/**
* Parser which reads in document instances according to the RSS 2.0
* specification and generates a news channel object.
*
* @author Anthony Eden, Niko Schmuck
*/
class RSS_2_0_Parser implements ChannelParserIF {

  private static Log logger = LogFactory.getLog(RSS_2_0_Parser.class);
 
  /**
   * Private constructor suppresses generation of a (public) default constructor.
   */
  private RSS_2_0_Parser() {}

  /**
   * Holder of the RSS_2_0_Parser instance.
   */
  private static class RSS_2_0_ParserHolder {
    private static RSS_2_0_Parser instance = new RSS_2_0_Parser();
  }

  /**
   * Get the RSS_2_0_Parser instance.
   */
  public static RSS_2_0_Parser getInstance() {
    return RSS_2_0_ParserHolder.instance;
  }

  private CategoryIF getCategoryList(CategoryIF parent, String title,
      Hashtable children) {
    // Assuming category hierarchy for each category element
    // is already mapped out into Hashtable tree;  Hense the children Hashtable

    // create channel builder to help create CategoryIF objects
    ChannelBuilder builder = new ChannelBuilder();

    // create current CategoryIF object; Parent may be null if at top level
    CategoryIF cat = builder.createCategory(parent, title);
    // iterate off list of keys from children list
    Enumeration itChild = children.keys();
    while (itChild.hasMoreElements()) {
      String childKey = (String) itChild.nextElement();
      // don't need to keep track of return CategoryIF since it will be added as child of another instance
      getCategoryList(cat, childKey, (Hashtable) children.get(childKey));
    }
    return cat;
  }

  /**
   * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom.Element)
   */
  public ChannelIF parse(ChannelBuilderIF cBuilder, Element root)
      throws ParseException {
    if (cBuilder == null) {
      throw new RuntimeException("Without builder no channel can "
          + "be created.");
    }
    Date dateParsed = new Date();
    logger.debug("start parsing.");

    Namespace defNS = ParserUtils.getDefaultNS(root);
    if (defNS == null) {
      defNS = Namespace.NO_NAMESPACE;
      logger.info("No default namespace found.");
    }
    Namespace dcNS = ParserUtils.getNamespace(root, "dc");
    // fall back to default name space
    if (dcNS == null) {
      dcNS = defNS;
    }

    // Content namespace
    Namespace contentNS = ParserUtils.getNamespace(root, "content");

    ParserUtils.matchCaseOfChildren(root, "channel");

    // Get the channel element (only one occurs)
    Element channel = root.getChild("channel", defNS);
    if (channel == null) {
      logger.warn("Channel element could not be retrieved from feed.");
      throw new ParseException("No channel element found in feed.");
    }

    // --- read in channel information

    ParserUtils.matchCaseOfChildren(channel, new String[] { "title",
        "description", "link", "language", "item", "image", "textinput",
        "copyright", "rating", "docs", "generator", "pubDate", "lastBuildDate",
        "category", "managingEditor", "webMaster", "cloud" });

    // 1 title element
    ChannelIF chnl = cBuilder.createChannel(channel, channel.getChildTextTrim(
        "title", defNS));

    // set channel format
    chnl.setFormat(ChannelFormat.RSS_2_0);

    // 1 description element
    chnl.setDescription(channel.getChildTextTrim("description", defNS));

    // 1 link element
    chnl.setSite(ParserUtils.getURL(channel.getChildTextTrim("link", defNS)));

    // 1 language element
    chnl.setLanguage(channel.getChildTextTrim("language", defNS));

    // 1..n item elements
    List items = channel.getChildren("item", defNS);
    Iterator i = items.iterator();
    while (i.hasNext()) {
      Element item = (Element) i.next();

      ParserUtils.matchCaseOfChildren(item, new String[] { "title", "link",
          "encoded", "description", "subject", "category", "pubDate", "date",
          "author", "creator", "comments", "guid", "source", "enclosure" });

      // get title element
      Element elTitle = item.getChild("title", defNS);
      String strTitle = "<No Title>";
      if (elTitle != null) {
        strTitle = elTitle.getTextTrim();
      }
      if (logger.isDebugEnabled()) {
        logger.debug("Item element found (" + strTitle + ").");
      }

      // get link element
      Element elLink = item.getChild("link", defNS);
      String strLink = "";
      if (elLink != null) {
        strLink = elLink.getTextTrim();
      }

      // get description element
      Element elDesc = item.getChild("encoded", contentNS);
      if (elDesc == null) {
        elDesc = item.getChild("description", defNS);
      }
      String strDesc = "";
      if (elDesc != null) {
        strDesc = elDesc.getTextTrim();
      }

      // generate new RSS item (link to article)
      ItemIF rssItem = cBuilder.createItem(item, chnl, strTitle, strDesc,
          ParserUtils.getURL(strLink));

      // get subject element
      Element elSubject = item.getChild("subject", defNS);
      if (elSubject == null) {
        // fallback mechanism: get dc:subject element
        elSubject = item.getChild("subject", dcNS);
      }
      if (elSubject != null) {
        rssItem.setSubject(elSubject.getTextTrim());
      }

      // get category list
      // get list of <category> elements
      List listCategory = item.getChildren("category", defNS);
      if (listCategory.size() < 1) {
        // fallback mechanism: get dc:category element
        listCategory = item.getChildren("category", dcNS);
      }
      if (listCategory.size() > 0) {
        RecursiveHashtable<String> catTable = new RecursiveHashtable<String>();

        // for each category, parse hierarchy
        Iterator itCat = listCategory.iterator();
        while (itCat.hasNext()) {
          RecursiveHashtable<String> currTable = catTable;
          Element elCategory = (Element) itCat.next();
          // get contents of category element
          String[] titles = elCategory.getTextNormalize().split("/");
          for (int x = 0; x < titles.length; x++) {
            // tokenize category string to extract out hierarchy
            if (currTable.containsKey(titles[x]) == false) {
              // if token does not exist in current map, add it with child Hashtable
              currTable.put(titles[x], new RecursiveHashtable<String>());
            }
            // reset current Hashtable to child's Hashtable then iterate to next token
            currTable = currTable.get(titles[x]);
          }
        }
        ArrayList<CategoryIF> catList = new ArrayList<CategoryIF>();
        // transform cat list & hierarchy into list of CategoryIF elements
        Enumeration<String> enumCategories = catTable.keys();
        while (enumCategories.hasMoreElements()) {
          String key = enumCategories.nextElement();
          // build category list: getCategoryList(parent, title, children)
          CategoryIF cat = getCategoryList(null, key, catTable.get(key));
          catList.add(cat);
        }
        if (catList.size() > 0) {
          // if categories were actually created, then add list to item node
          rssItem.setCategories(catList);
        }
      }

      // get publication date
      Element elDate = item.getChild("pubDate", defNS);
      if (elDate == null) {
        // fallback mechanism: get dc:date element
        elDate = item.getChild("date", dcNS);
      }
      if (elDate != null) {
        rssItem.setDate(ParserUtils.getDate(elDate.getTextTrim()));
      }

      rssItem.setFound(dateParsed);

      // get Author element
      Element elAuthor = item.getChild("author", defNS);
      if (elAuthor == null) {
        // fallback mechanism: get dc:creator element
        elAuthor = item.getChild("creator", dcNS);
      }
      if (elAuthor != null)
        rssItem.setCreator(elAuthor.getTextTrim());

      // get Comments element
      Element elComments = item.getChild("comments", defNS);
      String strComments = "";
      if (elComments != null) {
        strComments = elComments.getTextTrim();
      }
      rssItem.setComments(ParserUtils.getURL(strComments));

      // get guid element
      Element elGuid = item.getChild("guid", defNS);
      if (elGuid != null) {
        String guidUrl = elGuid.getTextTrim();
        if (guidUrl != null) {
          boolean permaLink = true;
          Attribute permaLinkAttribute = elGuid.getAttribute("isPermaLink",
              defNS);
          if (permaLinkAttribute != null) {
            String permaLinkStr = permaLinkAttribute.getValue();
            if (permaLinkStr != null) {
              permaLink = Boolean.valueOf(permaLinkStr).booleanValue();
            }
          }
          ItemGuidIF itemGuid = cBuilder.createItemGuid(rssItem, guidUrl,
              permaLink);
          rssItem.setGuid(itemGuid);
        }
      }

      // get source element
      Element elSource = item.getChild("source", defNS);
      if (elSource != null) {
        String sourceName = elSource.getTextTrim();
        Attribute sourceAttribute = elSource.getAttribute("url", defNS);
        if (sourceAttribute != null) {
          String sourceLocation = sourceAttribute.getValue().trim();
          ItemSourceIF itemSource = cBuilder.createItemSource(rssItem,
              sourceName, sourceLocation, null);
          rssItem.setSource(itemSource);
        }
      }

      // get enclosure element
      Element elEnclosure = item.getChild("enclosure", defNS);
      if (elEnclosure != null) {
        URL location = null;
        String type = null;
        int length = -1;
        Attribute urlAttribute = elEnclosure.getAttribute("url", defNS);
        if (urlAttribute != null) {
          location = ParserUtils.getURL(urlAttribute.getValue().trim());
        }
        Attribute typeAttribute = elEnclosure.getAttribute("type", defNS);
        if (typeAttribute != null) {
          type = typeAttribute.getValue().trim();
        }
        Attribute lengthAttribute = elEnclosure.getAttribute("length", defNS);
        if (lengthAttribute != null) {
          try {
            length = Integer.parseInt(lengthAttribute.getValue().trim());
          } catch (NumberFormatException e) {
            logger.warn(e);
          }
        }
        ItemEnclosureIF itemEnclosure = cBuilder.createItemEnclosure(rssItem,
            location, type, length);
        rssItem.setEnclosure(itemEnclosure);
      }
    }

    // 0..1 image element
    Element image = channel.getChild("image", defNS);
    if (image != null) {

      ParserUtils.matchCaseOfChildren(image, new String[] { "title", "url",
          "link", "width", "height", "description" });

      ImageIF rssImage = cBuilder.createImage(image.getChildTextTrim("title",
          defNS), ParserUtils.getURL(image.getChildTextTrim("url", defNS)),
          ParserUtils.getURL(image.getChildTextTrim("link", defNS)));
      Element imgWidth = image.getChild("width", defNS);
      if (imgWidth != null) {
        try {
          rssImage.setWidth(Integer.parseInt(imgWidth.getTextTrim()));
        } catch (NumberFormatException e) {
          logger.warn("Error parsing width: " + e.getMessage());
        }
      }
      Element imgHeight = image.getChild("height", defNS);
      if (imgHeight != null) {
        try {
          rssImage.setHeight(Integer.parseInt(imgHeight.getTextTrim()));
        } catch (NumberFormatException e) {
          logger.warn("Error parsing height: " + e.getMessage());
        }
      }
      Element imgDescr = image.getChild("description", defNS);
      if (imgDescr != null) {
        rssImage.setDescription(imgDescr.getTextTrim());
      }
      chnl.setImage(rssImage);
    }

    // 0..1 textinput element
    Element txtinp = channel.getChild("textinput", defNS);
    if (txtinp != null) {

      ParserUtils.matchCaseOfChildren(txtinp, new String[] { "title",
          "description", "name", "link" });

      TextInputIF rssTextInput = cBuilder.createTextInput(txtinp
          .getChildTextTrim("title", defNS), txtinp.getChildTextTrim(
          "description", defNS), txtinp.getChildTextTrim("name", defNS),
          ParserUtils.getURL(txtinp.getChildTextTrim("link", defNS)));
      chnl.setTextInput(rssTextInput);
    }

    // 0..1 copyright element
    Element copyright = channel.getChild("copyright", defNS);
    if (copyright != null) {
      chnl.setCopyright(copyright.getTextTrim());
    }

    // 0..1 Rating element
    Element rating = channel.getChild("rating", defNS);
    if (rating != null) {
      chnl.setRating(rating.getTextTrim());
    }

    // 0..1 Docs element
    Element docs = channel.getChild("docs", defNS);
    if (docs != null) {
      chnl.setDocs(docs.getTextTrim());
    }

    // 0..1 Generator element
    Element generator = channel.getChild("generator", defNS);
    if (generator != null) {
      chnl.setGenerator(generator.getTextTrim());
    }

    // 0..1 ttl element
    Element ttl = channel.getChild("ttl", defNS);
    if (ttl != null) {
      String ttlValue = ttl.getTextTrim();
      try {
        chnl.setTtl(Integer.parseInt(ttlValue));
      } catch (NumberFormatException e) {
        logger.warn("Invalid TTL format: '" + ttlValue + "'");
      }
    }

    // 0..1 pubDate element
    Element pubDate = channel.getChild("pubDate", defNS);
    if (pubDate != null) {
      chnl.setPubDate(ParserUtils.getDate(pubDate.getTextTrim()));
    }

    // 0..1 lastBuildDate element
    Element lastBuildDate = channel.getChild("lastBuildDate", defNS);
    if (lastBuildDate != null) {
      chnl.setLastBuildDate(ParserUtils.getDate(lastBuildDate.getTextTrim()));
    }

    // get category list
    // get list of <category> elements
    List listCategory = channel.getChildren("category", defNS);
    if (listCategory.size() < 1) {
      // fallback mechanism: get dc:category element
      listCategory = channel.getChildren("category", dcNS);
    }
    if (listCategory.size() > 0) {
      RecursiveHashtable<String> catTable = new RecursiveHashtable<String>();
      // for each category, parse hierarchy
      Iterator itCat = listCategory.iterator();
      while (itCat.hasNext()) {
        RecursiveHashtable<String> currTable = catTable;
        Element elCategory = (Element) itCat.next();
        // get contents of category element
        String[] titles = elCategory.getTextNormalize().split("/");
        for (int x = 0; x < titles.length; x++) {
          // tokenize category string to extract out hierarchy
          if (currTable.containsKey(titles[x]) == false) {
            // if token does not exist in current map, add it with child Hashtable
            currTable.put(titles[x], new RecursiveHashtable<String>());
          }
          // reset current Hashtable to child's Hashtable then iterate to next token
          currTable = currTable.get(titles[x]);
        }
      }
      ArrayList<CategoryIF> catList = new ArrayList<CategoryIF>();
      // transform cat list & hierarchy into list of CategoryIF elements
      Enumeration<String> enumCategories = catTable.keys();
      while (enumCategories.hasMoreElements()) {
        String key = enumCategories.nextElement();
        // build category list: getCategoryList(parent, title, children)
        CategoryIF cat = getCategoryList(null, key, catTable.get(key));
        catList.add(cat);
      }
      if (catList.size() > 0) {
        // if categories were actually created, then add list to item node
        chnl.setCategories(catList);
      }
    }

    // 0..1 managingEditor element
    Element managingEditor = channel.getChild("managingEditor", defNS);
    if (managingEditor != null) {
      chnl.setCreator(managingEditor.getTextTrim());
    }

    // 0..1 webMaster element
    Element webMaster = channel.getChild("webMaster", defNS);
    if (webMaster != null) {
      chnl.setPublisher(webMaster.getTextTrim());
    }

    // 0..1 cloud element
    Element cloud = channel.getChild("cloud", defNS);
    if (cloud != null) {
      String _port = cloud.getAttributeValue("port", defNS);
      int port = -1;
      if (_port != null) {
        try {
          port = Integer.parseInt(_port);
        } catch (NumberFormatException e) {
          logger.warn(e);
        }
      }
      chnl.setCloud(cBuilder.createCloud(cloud.getAttributeValue("domain",
          defNS), port, cloud.getAttributeValue("path", defNS), cloud
          .getAttributeValue("registerProcedure", defNS), cloud
          .getAttributeValue("protocol", defNS)));
    }

    chnl.setLastUpdated(dateParsed);

    // 0..1 skipHours element
    // 0..1 skipDays element

    return chnl;
  }

  /**
   * Implement type safety in a hashtable of hashtables.
   * @author Italo Borssatto
   */
  private static class RecursiveHashtable<T> extends
      Hashtable<T, RecursiveHashtable<T>> {
    /**
     * <code>serialVersionUID</code>
     */
    private static final long serialVersionUID = -3748524793347081535L;

    /**
     * @see java.util.Hashtable#put(java.lang.Object, java.lang.Object)
     */
    @Override
    public synchronized RecursiveHashtable<T> put(T key,
        RecursiveHashtable<T> value) {
      return super.put(key, value);
    }
  }

}
TOP

Related Classes of de.nava.informa.parsers.RSS_2_0_Parser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.