/*
* Copyright 2003-2014 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package groovy.util;
import groovy.util.slurpersupport.GPathResult;
import groovy.util.slurpersupport.Node;
import groovy.util.slurpersupport.NodeChild;
import groovy.util.slurpersupport.NamespaceAwareHashMap;
import groovy.xml.FactorySupport;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Map;
import java.util.Stack;
import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import groovy.xml.QName;
import org.xml.sax.Attributes;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
/**
* Parse XML into a document tree that may be traversed similar to XPath
* expressions. For example:
* <pre>
* def rootNode = new XmlSlurper().parseText(
* '<root><one a1="uno!"/><two>Some text!</two></root>' )
*
* assert rootNode.name() == 'root'
* assert rootNode.one[0].@a1 == 'uno!'
* assert rootNode.two.text() == 'Some text!'
* rootNode.children().each { assert it.name() in ['one','two'] }
* </pre>
* <p>
* Note that in some cases, a 'selector' expression may not resolve to a
* single node. For example:
* <pre>
* def rootNode = new XmlSlurper().parseText(
* '''<root>
* <a>one!</a>
* <a>two!</a>
* </root>''' )
*
* assert rootNode.a.size() == 2
* rootNode.a.each { assert it.text() in ['one!','two!'] }
* </pre>
*
* @author John Wilson
* @see GPathResult
*/
public class XmlSlurper extends DefaultHandler {
    /** The underlying SAX reader that drives this handler. */
    private final XMLReader reader;

    /** Element currently being built; reset to null at the start of each document and after {@link #getDocument()}. */
    private Node currentNode = null;

    /**
     * Parents of {@link #currentNode}. A null entry is pushed for the root element
     * (it has no parent), so a null-hostile collection cannot be used here.
     */
    private final Stack<Node> stack = new Stack<Node>();

    /** Character data accumulated since the last element boundary. */
    private final StringBuilder charBuffer = new StringBuilder();

    /** Prefix-to-URI mappings seen so far; only filled when {@link #namespaceAware} is true. */
    private final Map<String, String> namespaceTagHints = new Hashtable<String, String>();

    private boolean keepIgnorableWhitespace = false;

    /** Only set by the factory-based constructors; stays false when a reader or parser is supplied. */
    private boolean namespaceAware = false;

    /**
     * Creates a non-validating and namespace-aware <code>XmlSlurper</code> which does not allow DOCTYPE declarations in documents.
     *
     * @throws ParserConfigurationException if no parser which satisfies the requested configuration can be created.
     * @throws SAXException for SAX errors.
     */
    public XmlSlurper() throws ParserConfigurationException, SAXException {
        this(false, true);
    }

    /**
     * Creates a <code>XmlSlurper</code> which does not allow DOCTYPE declarations in documents.
     *
     * @param validating <code>true</code> if the parser should validate documents as they are parsed; false otherwise.
     * @param namespaceAware <code>true</code> if the parser should provide support for XML namespaces; <code>false</code> otherwise.
     *
     * @throws ParserConfigurationException if no parser which satisfies the requested configuration can be created.
     * @throws SAXException for SAX errors.
     */
    public XmlSlurper(final boolean validating, final boolean namespaceAware) throws ParserConfigurationException, SAXException {
        this(validating, namespaceAware, false);
    }

    /**
     * Creates a <code>XmlSlurper</code>.
     *
     * @param validating <code>true</code> if the parser should validate documents as they are parsed; false otherwise.
     * @param namespaceAware <code>true</code> if the parser should provide support for XML namespaces; <code>false</code> otherwise.
     * @param allowDocTypeDeclaration <code>true</code> if the parser should provide support for DOCTYPE declarations; <code>false</code> otherwise.
     *
     * @throws ParserConfigurationException if no parser which satisfies the requested configuration can be created.
     * @throws SAXException for SAX errors.
     */
    public XmlSlurper(final boolean validating, final boolean namespaceAware, boolean allowDocTypeDeclaration) throws ParserConfigurationException, SAXException {
        SAXParserFactory factory = FactorySupport.createSaxParserFactory();
        factory.setNamespaceAware(namespaceAware);
        this.namespaceAware = namespaceAware;
        factory.setValidating(validating);
        // Hardening features are applied on a best-effort basis; see setQuietly.
        setQuietly(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
        setQuietly(factory, "http://apache.org/xml/features/disallow-doctype-decl", !allowDocTypeDeclaration);
        reader = factory.newSAXParser().getXMLReader();
    }

    /**
     * Creates a <code>XmlSlurper</code> which parses with the supplied reader.
     * The reader is used as configured by the caller. This constructor does not
     * enable namespace tracking in the slurper, so prefix mappings reported by
     * the reader are not recorded.
     *
     * @param reader the XMLReader used for all subsequent parsing
     */
    public XmlSlurper(final XMLReader reader) {
        this.reader = reader;
    }

    /**
     * Creates a <code>XmlSlurper</code> which parses with the XMLReader of the supplied parser.
     * As with {@link #XmlSlurper(XMLReader)}, namespace tracking in the slurper is not enabled.
     *
     * @param parser the SAXParser whose XMLReader is used for all subsequent parsing
     * @throws SAXException if the XMLReader cannot be obtained from the parser
     */
    public XmlSlurper(final SAXParser parser) throws SAXException {
        this(parser.getXMLReader());
    }

    /**
     * Sets a feature on the factory, deliberately ignoring a factory that does
     * not know or support it so that construction still succeeds.
     *
     * @param factory the factory to configure
     * @param feature the feature URI
     * @param value the value to set the feature to
     */
    private void setQuietly(SAXParserFactory factory, String feature, boolean value) {
        try {
            factory.setFeature(feature, value);
        }
        // Best effort only: an unsupported feature must not prevent the slurper from being created.
        catch (ParserConfigurationException ignored) { }
        catch (SAXNotRecognizedException ignored) { }
        catch (SAXNotSupportedException ignored) { }
    }

    /**
     * @deprecated use {@link #setKeepIgnorableWhitespace(boolean)}
     * @param keepWhitespace If true then whitespace before elements is kept.
     *                       The default is to discard the whitespace.
     */
    @Deprecated
    public void setKeepWhitespace(boolean keepWhitespace) {
        setKeepIgnorableWhitespace(keepWhitespace);
    }

    /**
     * @param keepIgnorableWhitespace If true then ignorable whitespace (i.e. whitespace before elements) is kept.
     *                                The default is to discard the whitespace. The same flag also decides
     *                                whether character data consisting only of whitespace is added to the tree.
     */
    public void setKeepIgnorableWhitespace(boolean keepIgnorableWhitespace) {
        this.keepIgnorableWhitespace = keepIgnorableWhitespace;
    }

    /**
     * @return true if ignorable whitespace is kept
     */
    public boolean isKeepIgnorableWhitespace() {
        return keepIgnorableWhitespace;
    }

    /**
     * Wraps the tree built from the consumed SAX events in a GPathResult.
     * The current node is cleared once the result has been created. A call made
     * after one of the parse methods (which call this method themselves), or any
     * call after the first, therefore wraps a null node rather than the parsed
     * document.
     *
     * @return The GPathResult instance created by consuming a stream of SAX events
     */
    public GPathResult getDocument() {
        try {
            // xml namespace is always defined
            if (namespaceAware) {
                namespaceTagHints.put("xml", "http://www.w3.org/XML/1998/namespace");
            }
            return new NodeChild(currentNode, null, namespaceTagHints);
        } finally {
            currentNode = null;
        }
    }

    /**
     * Parse the content of the specified input source into a GPathResult object
     *
     * @param input the InputSource to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     * or character stream supplied by the application.
     */
    public GPathResult parse(final InputSource input) throws IOException, SAXException {
        reader.setContentHandler(this);
        reader.parse(input);
        return getDocument();
    }

    /**
     * Parses the content of the given file as XML turning it into a GPathResult object.
     * The file stream opened here is always closed before this method returns.
     *
     * @param file the File to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     * or character stream supplied by the application.
     */
    public GPathResult parse(final File file) throws IOException, SAXException {
        final FileInputStream fis = new FileInputStream(file);
        try {
            final InputSource input = new InputSource(fis);
            input.setSystemId("file://" + file.getAbsolutePath());
            return parse(input);
        } finally {
            fis.close();
        }
    }

    /**
     * Parse the content of the specified input stream into an GPathResult Object.
     * Note that using this method will not provide the parser with any URI
     * for which to find DTDs etc. It is up to you to close the InputStream
     * after parsing is complete (if required).
     *
     * @param input the InputStream to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     * or character stream supplied by the application.
     */
    public GPathResult parse(final InputStream input) throws IOException, SAXException {
        return parse(new InputSource(input));
    }

    /**
     * Parse the content of the specified reader into a GPathResult Object.
     * Note that using this method will not provide the parser with any URI
     * for which to find DTDs etc. It is up to you to close the Reader
     * after parsing is complete (if required).
     *
     * @param in the Reader to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     * or character stream supplied by the application.
     */
    public GPathResult parse(final Reader in) throws IOException, SAXException {
        return parse(new InputSource(in));
    }

    /**
     * Parse the content of the specified URI into a GPathResult Object
     *
     * @param uri a String containing the URI to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     * or character stream supplied by the application.
     */
    public GPathResult parse(final String uri) throws IOException, SAXException {
        return parse(new InputSource(uri));
    }

    /**
     * A helper method to parse the given text as XML
     *
     * @param text a String containing XML to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     * or character stream supplied by the application.
     */
    public GPathResult parseText(final String text) throws IOException, SAXException {
        return parse(new StringReader(text));
    }

    // Delegated XMLReader methods
    //------------------------------------------------------------------------

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getDTDHandler()
     */
    public DTDHandler getDTDHandler() {
        return reader.getDTDHandler();
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getEntityResolver()
     */
    public EntityResolver getEntityResolver() {
        return reader.getEntityResolver();
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getErrorHandler()
     */
    public ErrorHandler getErrorHandler() {
        return reader.getErrorHandler();
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
     */
    public boolean getFeature(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
        return reader.getFeature(uri);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
     */
    public Object getProperty(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
        return reader.getProperty(uri);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
     */
    public void setDTDHandler(final DTDHandler dtdHandler) {
        reader.setDTDHandler(dtdHandler);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
     */
    public void setEntityResolver(final EntityResolver entityResolver) {
        reader.setEntityResolver(entityResolver);
    }

    /**
     * Resolves entities against using the supplied URL as the base for relative URLs
     *
     * @param base The URL used to resolve relative URLs
     */
    public void setEntityBaseUrl(final URL base) {
        reader.setEntityResolver(new EntityResolver() {
            public InputSource resolveEntity(final String publicId, final String systemId) throws IOException {
                return new InputSource(new URL(base, systemId).openStream());
            }
        });
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    public void setErrorHandler(final ErrorHandler errorHandler) {
        reader.setErrorHandler(errorHandler);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
     */
    public void setFeature(final String uri, final boolean value) throws SAXNotRecognizedException, SAXNotSupportedException {
        reader.setFeature(uri, value);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
     */
    public void setProperty(final String uri, final Object value) throws SAXNotRecognizedException, SAXNotSupportedException {
        reader.setProperty(uri, value);
    }

    // ContentHandler interface
    //-------------------------------------------------------------------------

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#startDocument()
     */
    @Override
    public void startDocument() throws SAXException {
        currentNode = null;
        // A previous parse that failed part-way leaves its open elements on the stack;
        // drop them so they are not retained while this slurper is reused.
        stack.clear();
        charBuffer.setLength(0);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
     */
    @Override
    public void startPrefixMapping(final String tag, final String uri) throws SAXException {
        if (namespaceAware) namespaceTagHints.put(tag, uri);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
     */
    @Override
    public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes atts) throws SAXException {
        // Text collected so far belongs to the enclosing element, not to the one being opened.
        addCdata();

        final Map<String, String> attributes = new NamespaceAwareHashMap();
        final Map<String, String> attributeNamespaces = new HashMap<String, String>();

        for (int i = atts.getLength() - 1; i != -1; i--) {
            final String attributeUri = atts.getURI(i);
            if (attributeUri.length() == 0) {
                // Attribute without a namespace: keyed by its qualified name.
                attributes.put(atts.getQName(i), atts.getValue(i));
            } else {
                // Namespaced attribute: keyed by the QName string, with the URI recorded separately.
                final String key = new QName(attributeUri, atts.getLocalName(i)).toString();
                attributes.put(key, atts.getValue(i));
                attributeNamespaces.put(key, attributeUri);
            }
        }

        // Elements without a namespace are named by their qualified name, the others by their local name.
        final String elementName = namespaceURI.length() == 0 ? qName : localName;
        final Node newElement = new Node(currentNode, elementName, attributes, attributeNamespaces, namespaceURI);

        if (currentNode != null) {
            currentNode.addChild(newElement);
        }

        // For the root element this pushes null, which endElement relies on to keep the root current.
        stack.push(currentNode);
        currentNode = newElement;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int)
     */
    @Override
    public void ignorableWhitespace(char buffer[], int start, int len) throws SAXException {
        if (keepIgnorableWhitespace) characters(buffer, start, len);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#characters(char[], int, int)
     */
    @Override
    public void characters(final char[] ch, final int start, final int length) throws SAXException {
        charBuffer.append(ch, start, length);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
     */
    @Override
    public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
        addCdata();
        Node oldCurrentNode = stack.pop();
        // A null parent means the root element was just closed; keep it current so getDocument can return it.
        if (oldCurrentNode != null) {
            currentNode = oldCurrentNode;
        }
    }

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#endDocument()
     */
    @Override
    public void endDocument() throws SAXException {
    }

    /**
     * Flushes the buffered character data into the current node as a text child.
     * Whitespace-only data is dropped unless ignorable whitespace is being kept.
     */
    private void addCdata() {
        if (charBuffer.length() != 0) {
            //
            // This element is preceded by CDATA if keepIgnorableWhitespace is false (the default setting) and
            // it's not whitespace add it to the body
            // Note that, according to the XML spec, we should preserve the CDATA if it's all whitespace
            // but for the sort of work I'm doing ignoring the whitespace is preferable
            //
            final String cdata = charBuffer.toString();
            charBuffer.setLength(0);
            if (keepIgnorableWhitespace || cdata.trim().length() != 0) {
                currentNode.addChild(cdata);
            }
        }
    }
}