/*
* Copyright (c) 2005 Henri Sivonen
* Copyright (c) 2007-2013 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.xml;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import nu.validator.httpclient.ssl.PromiscuousSSLProtocolSocketFactory;
import nu.validator.io.BoundedInputStream;
import nu.validator.io.ObservableInputStream;
import nu.validator.io.StreamBoundException;
import nu.validator.io.StreamObserver;
import nu.validator.io.SystemIdIOException;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.log4j.Logger;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import com.hp.hpl.jena.iri.IRI;
import com.hp.hpl.jena.iri.IRIException;
import com.hp.hpl.jena.iri.IRIFactory;
/**
* @version $Id: PrudentHttpEntityResolver.java,v 1.1 2005/01/08 08:11:26
* hsivonen Exp $
* @author hsivonen
*/
@SuppressWarnings("deprecation")
public class PrudentHttpEntityResolver implements EntityResolver {
private static final Logger log4j = Logger.getLogger(PrudentHttpEntityResolver.class);
private static final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
private static final HttpClient client = new HttpClient(manager);
private static int maxRequests;
private long sizeLimit;
private final ErrorHandler errorHandler;
private int requestsLeft;
private boolean laxContentType;
private boolean allowRnc = false;
private boolean allowHtml = false;
private boolean allowXhtml = false;
private boolean acceptAllKnownXmlTypes = false;
private boolean allowGenericXml = true;
private final IRIFactory iriFactory;
private final ContentTypeParser contentTypeParser;
static {
if ("true".equals(System.getProperty(
"nu.validator.xml.promiscuous-ssl", "false"))) {
Protocol.registerProtocol("https", new Protocol("https",
new PromiscuousSSLProtocolSocketFactory(), 443));
}
}
/**
* Sets the timeouts of the HTTP client.
*
* @param connectionTimeout
* timeout until connection established in milliseconds. Zero
* means no timeout.
* @param socketTimeout
* timeout for waiting for data in milliseconds. Zero means no
* timeout.
*/
public static void setParams(int connectionTimeout, int socketTimeout,
int maxRequests) {
HttpConnectionManagerParams hcmp = client.getHttpConnectionManager().getParams();
hcmp.setConnectionTimeout(connectionTimeout);
hcmp.setSoTimeout(socketTimeout);
hcmp.setMaxConnectionsPerHost(HostConfiguration.ANY_HOST_CONFIGURATION,
maxRequests);
hcmp.setMaxTotalConnections(200); // XXX take this from a property
PrudentHttpEntityResolver.maxRequests = maxRequests;
HttpClientParams hcp = client.getParams();
hcp.setBooleanParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, true);
hcp.setIntParameter(HttpClientParams.MAX_REDIRECTS, 20); // Gecko
// default
}
public static void setUserAgent(String ua) {
client.getParams().setParameter("http.useragent", ua);
}
/**
* @param connectionTimeout
* @param socketTimeout
* @param sizeLimit
*/
public PrudentHttpEntityResolver(long sizeLimit, boolean laxContentType,
ErrorHandler errorHandler) {
this.sizeLimit = sizeLimit;
this.requestsLeft = maxRequests;
this.laxContentType = laxContentType;
this.errorHandler = errorHandler;
this.iriFactory = new IRIFactory();
this.iriFactory.useSpecificationXMLSystemID(true);
this.iriFactory.useSchemeSpecificRules("http", true);
this.iriFactory.useSchemeSpecificRules("https", true);
this.contentTypeParser = new ContentTypeParser(errorHandler,
laxContentType, this.allowRnc, this.allowHtml, this.allowXhtml,
this.acceptAllKnownXmlTypes, this.allowGenericXml);
}
/**
* @see org.xml.sax.EntityResolver#resolveEntity(java.lang.String,
* java.lang.String)
*/
public InputSource resolveEntity(String publicId, String systemId)
throws SAXException, IOException {
if (requestsLeft > -1) {
if (requestsLeft == 0) {
throw new IOException(
"Number of permitted HTTP requests exceeded.");
} else {
requestsLeft--;
}
}
GetMethod m = null;
try {
IRI iri;
try {
iri = iriFactory.construct(systemId);
} catch (IRIException e) {
IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
SAXParseException spe = new SAXParseException(e.getMessage(),
publicId, systemId, -1, -1, ioe);
if (errorHandler != null) {
errorHandler.fatalError(spe);
}
throw spe;
}
if (!iri.isAbsolute()) {
SAXParseException spe = new SAXParseException(
"Not an absolute URI.", publicId, systemId, -1, -1,
new IOException("Not an absolute URI."));
if (errorHandler != null) {
errorHandler.fatalError(spe);
}
throw spe;
}
String scheme = iri.getScheme();
if (!("http".equals(scheme) || "https".equals(scheme))) {
String msg = "Unsupported URI scheme: \u201C" + scheme
+ "\u201D.";
SAXParseException spe = new SAXParseException(msg, publicId,
systemId, -1, -1, new IOException(msg));
if (errorHandler != null) {
errorHandler.fatalError(spe);
}
throw spe;
}
try {
systemId = iri.toASCIIString();
} catch (MalformedURLException e) {
IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
SAXParseException spe = new SAXParseException(e.getMessage(),
publicId, systemId, -1, -1, ioe);
if (errorHandler != null) {
errorHandler.fatalError(spe);
}
throw spe;
}
try {
m = new GetMethod(systemId);
} catch (IllegalArgumentException e) {
SAXParseException spe = new SAXParseException(
e.getMessage(),
publicId,
systemId,
-1,
-1,
(IOException) new IOException(e.getMessage()).initCause(e));
if (errorHandler != null) {
errorHandler.fatalError(spe);
}
throw spe;
}
m.setFollowRedirects(true);
m.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
m.addRequestHeader("Accept", buildAccept());
m.addRequestHeader("Accept-Encoding", "gzip");
log4j.info(systemId);
client.executeMethod(m);
int statusCode = m.getStatusCode();
if (statusCode != 200) {
String msg = "HTTP resource not retrievable. The HTTP status from the remote server was: "
+ statusCode + ".";
SAXParseException spe = new SAXParseException(msg, publicId,
m.getURI().toString(), -1, -1, new IOException(msg));
if (errorHandler != null) {
errorHandler.fatalError(spe);
}
throw spe;
}
long len = m.getResponseContentLength();
if (sizeLimit > -1 && len > sizeLimit) {
SAXParseException spe = new SAXParseException(
"Resource size exceeds limit.",
publicId,
m.getURI().toString(),
-1,
-1,
new StreamBoundException("Resource size exceeds limit."));
if (errorHandler != null) {
errorHandler.fatalError(spe);
}
throw spe;
}
TypedInputSource is;
Header ct = m.getResponseHeader("Content-Type");
String contentType = null;
final String baseUri = m.getURI().toString();
if (ct != null) {
contentType = ct.getValue();
}
is = contentTypeParser.buildTypedInputSource(baseUri, publicId,
contentType);
Header cl = m.getResponseHeader("Content-Language");
if (cl != null) {
is.setLanguage(cl.getValue().trim());
}
Header xuac = m.getResponseHeader("X-UA-Compatible");
if (xuac != null) {
SAXParseException spe = new SAXParseException(
"X-UA-Compatible is a browser-specific HTTP header.",
publicId, systemId, -1, -1);
errorHandler.warning(spe);
}
final GetMethod meth = m;
InputStream stream = m.getResponseBodyAsStream();
if (sizeLimit > -1) {
stream = new BoundedInputStream(stream, sizeLimit, baseUri);
}
Header ce = m.getResponseHeader("Content-Encoding");
if (ce != null) {
String val = ce.getValue().trim();
if ("gzip".equalsIgnoreCase(val)
|| "x-gzip".equalsIgnoreCase(val)) {
stream = new GZIPInputStream(stream);
if (sizeLimit > -1) {
stream = new BoundedInputStream(stream, sizeLimit,
baseUri);
}
}
}
is.setByteStream(new ObservableInputStream(stream,
new StreamObserver() {
private final Logger log4j = Logger.getLogger("nu.validator.xml.PrudentEntityResolver.StreamObserver");
private boolean released = false;
public void closeCalled() {
log4j.debug("closeCalled");
if (!released) {
log4j.debug("closeCalled, not yet released");
released = true;
try {
meth.releaseConnection();
} catch (Exception e) {
log4j.debug(
"closeCalled, releaseConnection", e);
}
}
}
public void exceptionOccurred(Exception ex)
throws IOException {
if (!released) {
released = true;
try {
meth.abort();
} catch (Exception e) {
log4j.debug("exceptionOccurred, abort", e);
} finally {
try {
meth.releaseConnection();
} catch (Exception e) {
log4j.debug(
"exceptionOccurred, releaseConnection",
e);
}
}
}
if (ex instanceof SystemIdIOException) {
SystemIdIOException siie = (SystemIdIOException) ex;
throw siie;
} else if (ex instanceof IOException) {
IOException ioe = (IOException) ex;
throw new SystemIdIOException(baseUri,
ioe.getMessage(), ioe);
} else if (ex instanceof RuntimeException) {
RuntimeException re = (RuntimeException) ex;
throw re;
} else {
throw new RuntimeException(
"API contract violation. Wrong exception type.",
ex);
}
}
public void finalizerCalled() {
if (!released) {
released = true;
try {
meth.abort();
} catch (Exception e) {
log4j.debug("finalizerCalled, abort", e);
} finally {
try {
meth.releaseConnection();
} catch (Exception e) {
log4j.debug(
"finalizerCalled, releaseConnection",
e);
}
}
}
}
}));
return is;
} catch (IOException e) {
if (m != null) {
try {
m.abort();
} catch (Exception ex) {
log4j.debug("abort", ex);
} finally {
try {
m.releaseConnection();
} catch (Exception ex) {
log4j.debug("releaseConnection", ex);
}
}
}
throw e;
} catch (SAXException e) {
if (m != null) {
try {
m.abort();
} catch (Exception ex) {
log4j.debug("abort", ex);
} finally {
try {
m.releaseConnection();
} catch (Exception ex) {
log4j.debug("releaseConnection", ex);
}
}
}
throw e;
} catch (RuntimeException e) {
if (m != null) {
try {
m.abort();
} catch (Exception ex) {
log4j.debug("abort", ex);
} finally {
try {
m.releaseConnection();
} catch (Exception ex) {
log4j.debug("releaseConnection", ex);
}
}
}
throw e;
}
}
/**
* @return Returns the allowRnc.
*/
public boolean isAllowRnc() {
return allowRnc;
}
/**
* @param allowRnc
* The allowRnc to set.
*/
public void setAllowRnc(boolean allowRnc) {
this.allowRnc = allowRnc;
this.contentTypeParser.setAllowRnc(allowRnc);
}
/**
* @param b
*/
public void setAllowHtml(boolean allowHtml) {
this.allowHtml = allowHtml;
this.contentTypeParser.setAllowHtml(allowHtml);
}
/**
* Returns the acceptAllKnownXmlTypes.
*
* @return the acceptAllKnownXmlTypes
*/
public boolean isAcceptAllKnownXmlTypes() {
return acceptAllKnownXmlTypes;
}
/**
* Sets the acceptAllKnownXmlTypes.
*
* @param acceptAllKnownXmlTypes
* the acceptAllKnownXmlTypes to set
*/
public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) {
this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
this.contentTypeParser.setAcceptAllKnownXmlTypes(acceptAllKnownXmlTypes);
}
/**
* Returns the allowGenericXml.
*
* @return the allowGenericXml
*/
public boolean isAllowGenericXml() {
return allowGenericXml;
}
/**
* Sets the allowGenericXml.
*
* @param allowGenericXml
* the allowGenericXml to set
*/
public void setAllowGenericXml(boolean allowGenericXml) {
this.allowGenericXml = allowGenericXml;
this.contentTypeParser.setAllowGenericXml(allowGenericXml);
}
/**
* Returns the allowXhtml.
*
* @return the allowXhtml
*/
public boolean isAllowXhtml() {
return allowXhtml;
}
/**
* Sets the allowXhtml.
*
* @param allowXhtml
* the allowXhtml to set
*/
public void setAllowXhtml(boolean allowXhtml) {
this.allowXhtml = allowXhtml;
this.contentTypeParser.setAllowXhtml(allowXhtml);
}
private String buildAccept() {
return "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
}
/**
* Returns the allowHtml.
*
* @return the allowHtml
*/
public boolean isAllowHtml() {
return allowHtml;
}
public boolean isOnlyHtmlAllowed() {
return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml();
}
}