/*
* Copyright (c) 1999-2002 ChurchillObjects.com All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer. Redistributions in
* binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other
* materials provided with the distribution. Neither the name of the copyright
* holder nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT, INCLUDING NEGLIGENCE OR OTHERWISE, ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
*/
package churchillobjects.rss4j.parser;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Vector;
//import org.apache.xerces.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import churchillobjects.rss4j.RssChannel;
import churchillobjects.rss4j.RssChannelImage;
import churchillobjects.rss4j.RssChannelItem;
import churchillobjects.rss4j.RssChannelTextInput;
import churchillobjects.rss4j.RssDocument;
import churchillobjects.rss4j.model.RssNamespace;
import churchillobjects.rss4j.model.RssVersion;
import org.apache.xerces.parsers.SAXParser;
/**
 * This parser is really a 'bootstrap' parser, whose only real purpose
 * is to identify what version the RSS document is and then instantiate
 * and pass processing along to a parser appropriate to that version. Its
 * functionality is available to client code only through its set of public
 * static parse methods, each of which takes a different variety of input and
 * converts it to an InputSource for the SAX handler.
 * <p>
 * Version detection happens in two places: {@link #startPrefixMapping}
 * recognizes RSS 0.90 and 1.0 by their default namespace URI, and
 * {@link #startElement} recognizes RSS 0.91 by the <code>version</code>
 * attribute of the <code>rss</code> element. Once the version is known, the
 * content and error handlers of the underlying XML reader are swapped to the
 * version-specific implementation.
 */
public class RssParser extends DefaultHandler implements RssVersion{

    /**
     * The xml reader object
     */
    private XMLReader parser;

    /**
     * The main document object to be used as the parser runs through its events.
     */
    protected RssDocument document;

    /**
     * Holds the current channel object while the parser executes.
     */
    protected RssChannel currentChannel;

    /**
     * Holds the current item object while the parser executes.
     */
    protected RssChannelItem currentItem;

    /**
     * Holds the current text input object while the parser executes.
     */
    protected RssChannelTextInput currentTextInput;

    /**
     * Holds the current image object while the parser executes.
     */
    protected RssChannelImage currentImage;

    /**
     * Holds a buffer of characters for the current element. May be null
     * between elements; see {@link #resetChars()}.
     */
    private StringBuffer characters;

    /**
     * Holds the namespaces that were found in the document, as
     * RssNamespace objects.
     */
    protected Vector namespaces;

    /**
     * Invokes a parse of an RSS document contained in a String as unicode text.
     * An input source will be created from a string reader based off of that
     * string which will be used by the xml parser.
     * @param rss The complete RSS document text.
     * @return The RssDocument object structure.
     * @throws RssParseException If the document cannot be read or parsed.
     */
    public static RssDocument parseRss(String rss) throws RssParseException{
        StringReader sr = new StringReader(rss);
        InputSource inputSource = new InputSource(sr);
        RssParser parser = new RssParser();
        return parser.parseRss(inputSource);
    }

    /**
     * Invokes a parse of an RSS document made available from an input stream.
     * An input source will be created from the input stream which will be used
     * by the xml parser. This method does not itself close the stream.
     * @param is The stream to read the RSS document from.
     * @return The RssDocument object structure.
     * @throws RssParseException If the document cannot be read or parsed.
     */
    public static RssDocument parseRss(InputStream is) throws RssParseException{
        InputSource inputSource = new InputSource(is);
        RssParser parser = new RssParser();
        return parser.parseRss(inputSource);
    }

    /**
     * Invokes a parse of an RSS document made available from a reader object.
     * An input source will be created from the reader which will be used
     * by the xml parser. This method does not itself close the reader.
     * @param r The reader to read the RSS document from.
     * @return The RssDocument object structure.
     * @throws RssParseException If the document cannot be read or parsed.
     */
    public static RssDocument parseRss(Reader r) throws RssParseException{
        InputSource inputSource = new InputSource(r);
        RssParser parser = new RssParser();
        return parser.parseRss(inputSource);
    }

    /**
     * Invokes a parse of an RSS document made available from a file object.
     * An input source will be created from an input stream off of the file
     * which will be used by the xml parser. The stream opened here is always
     * closed before this method returns, whether or not the parse succeeds.
     * @param file The file containing the RSS document.
     * @return The RssDocument object structure.
     * @throws RssParseException If the file cannot be opened, read or parsed.
     */
    public static RssDocument parseRss(File file) throws RssParseException{
        FileInputStream fis;
        try{
            fis = new FileInputStream(file);
        }
        catch(FileNotFoundException e){
            throw new RssParseException(e);
        }
        try{
            InputSource inputSource = new InputSource(fis);
            RssParser parser = new RssParser();
            return parser.parseRss(inputSource);
        }
        finally{
            // This method opened the stream, so it is responsible for
            // releasing the file handle on every exit path.
            try{
                fis.close();
            }
            catch(IOException ignored){
                // Best effort: a failure to close must not mask the parse
                // result or the original parse exception.
            }
        }
    }

    /**
     * Blank constructor, used by the static parse methods before the version of
     * the document is known.
     */
    RssParser(){
        namespaces = new Vector();
    }

    /**
     * In-package constructor, used by subclasses to set their already set-up
     * document and namespace objects.
     * @param document The document being populated by the parse.
     * @param namespaces The namespaces collected so far (may be null).
     */
    RssParser(RssDocument document, Vector namespaces){
        this.document = document;
        this.namespaces = namespaces;
    }

    /**
     * Starts the parsing of a document from the specified input source. Through
     * the SAX events, the RSS document object model is created and returned.
     * If something should go wrong, then the exception is caught and thrown as
     * an RssParseException, unless of course it is a SAXException wrapping an
     * RssParseException, in which case the RssParseException will be unwrapped
     * and rethrown. Either way, the client code only needs to worry about the
     * RssParseException.
     * <p>
     * The parser's working state is released via {@link #cleanUp()} on every
     * exit path, including unexpected runtime exceptions.
     * @param is The input source to parse.
     * @return The RssDocument object structure.
     * @throws RssParseException If the source cannot be read or parsed, or the
     * RSS version cannot be determined.
     */
    RssDocument parseRss(InputSource is) throws RssParseException{
        document = new RssDocument();
        namespaces = new Vector();
        characters = new StringBuffer();
        // NOTE(review): the reader is used with its default configuration, so
        // DTDs and external entities referenced by the feed are presumably
        // resolved. If feeds come from untrusted sources, consider restricting
        // this - but confirm first that RSS 0.91 feeds declaring a DOCTYPE
        // still parse.
        parser = new SAXParser();
        parser.setContentHandler(this);
        parser.setErrorHandler(this);
        try{
            parser.parse(is);
            // local reference to the doc that will be returned, before cleanup
            RssDocument returnDoc = document;
            return returnDoc;
        }
        catch(SAXException e){
            Exception embedded = e.getException();
            if(embedded instanceof RssParseException){
                throw (RssParseException)embedded;
            }
            // A SAXException need not carry an embedded exception (e.g. one
            // raised with only a message). Wrap the SAXException itself in
            // that case so the message is not lost behind a null cause.
            throw new RssParseException(embedded != null ? embedded : e);
        }
        catch(IOException e){
            throw new RssParseException(e);
        }
        finally{
            // release all local objects for gc
            cleanUp();
        }
    }

    /**
     * Releases local attributes of the parser for gc
     */
    protected void cleanUp(){
        parser = null;
        document = null;
        currentChannel = null;
        currentItem = null;
        currentTextInput = null;
        currentImage = null;
        characters = null;
        namespaces = null;
    }

    /**
     * SAX event. Signifies that a prefix mapping has been encountered. This is used to identify
     * RSS document versions .90 and 1.0. Every mapping is recorded in the
     * namespaces list; only the default namespace (empty prefix) is used for
     * version detection.
     * @param prefix The namespace prefix being declared; empty for the default namespace.
     * @param uri The namespace URI the prefix is mapped to.
     * @throws SAXException Wrapping an RssParseException if the default
     * namespace is not a recognized RSS namespace.
     */
    public void startPrefixMapping(String prefix, String uri) throws SAXException{
        namespaces.addElement(new RssNamespace(prefix, uri));
        if(prefix.length()==0){
            if("http://my.netscape.com/rdf/simple/0.9/".equals(uri)){
                document.setVersion(VERSION_90);
                DefaultHandler handler = new RssParserImpl090(document, namespaces);
                parser.setContentHandler(handler);
                parser.setErrorHandler(handler);
            }
            else if("http://purl.org/rss/1.0/".equals(uri)){
                document.setVersion(VERSION_10);
                DefaultHandler handler = new RssParserImpl100(document, namespaces);
                parser.setContentHandler(handler);
                parser.setErrorHandler(handler);
            }
            else{
                throw new SAXException(new RssParseException("Could not determine the RSS version from namespace: " + uri));
            }
        }
    }

    /**
     * SAX event. Signifies the start of an element. This method is only used
     * to get the version of the RSS document being parsed; subclassed events
     * will handle the data values and add them to the RSS document object model.
     * @param uri The namespace URI of the element.
     * @param name The local name of the element.
     * @param qName The qualified name of the element.
     * @param atts The attributes attached to the element.
     * @throws SAXException Wrapping an RssParseException if the version is
     * missing, unsupported or unrecognized, or if a channel element is reached
     * before any version was determined.
     */
    public void startElement(String uri, String name, String qName, Attributes atts) throws SAXException{
        if("rss".equals(name)){
            // getValue returns null when the attribute is absent, so compare
            // constant-first; a missing version then falls through to the
            // "could not determine" error instead of a NullPointerException.
            String version = atts.getValue("version");
            if("0.91".equals(version)){
                document.setVersion(VERSION_91);
                DefaultHandler handler = new RssParserImpl091(document, null);
                parser.setContentHandler(handler);
                parser.setErrorHandler(handler);
            }
            else if("0.92".equals(version)){
                throw new SAXException(new RssParseException("RSS 0.92 not supported by this toolkit"));
            }
            else{
                throw new SAXException(new RssParseException("Could not determine the RSS version: " + version));
            }
        }
        if("channel".equals(name)){
            // we've gone too far and not determined the version...
            throw new SAXException(new RssParseException("Could not determine the RSS version of this document."));
        }
    }

    /**
     * SAX event. Adds characters to the string buffer. This may be called several
     * times for an element even though only a few characters are present.
     * @param ch The character array holding the reported characters.
     * @param start The index of the first character to append.
     * @param length The number of characters to append.
     */
    public void characters(char ch[], int start, int length){
        if(characters==null){
            characters = new StringBuffer();
        }
        characters.append(ch, start, length);
    }

    /**
     * Returns the characters string buffer as a string, then resets it
     * for the next element. This is a read-once value, then the value is gone.
     * @return The trimmed buffered text, or an empty string if nothing is buffered.
     */
    protected String getChars(){
        if(characters==null){
            return "";
        }
        String s = characters.toString().trim();
        resetChars();
        return s;
    }

    /**
     * Indicates that characters are available. This is different from getChars
     * because it does not clear the string buffer.
     * @return True if the buffer holds anything other than whitespace.
     */
    protected boolean hasChars(){
        if(characters==null){
            return false;
        }
        return characters.toString().trim().length() > 0;
    }

    /**
     * Resets the characters string buffer. This is done after parsing an
     * element so that the next element gets a clean buffer to add to.
     */
    protected void resetChars(){
        characters = null;
    }

    /**
     * Returns the value of the first attribute whose qualified name matches
     * the given name. If there is no such attribute, then returns null.
     * @param attrs The attribute list of the current element.
     * @param name The qualified attribute name to look for.
     * @return The attribute value, or null if it is not present.
     */
    protected String getAttribute(Attributes attrs, String name){
        for(int i=0;i<attrs.getLength();i++){
            String qname = attrs.getQName(i);
            if(qname.equals(name)){
                return attrs.getValue(i);
            }
        }
        return null;
    }
}