/*
* (c) Winterwell Associates Ltd, 2008-2011
* All rights reserved except.
*/
package winterwell.utils.web;
import java.awt.Color;
import java.awt.Desktop;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import winterwell.utils.FailureException;
import winterwell.utils.IORException;
import winterwell.utils.NotUniqueException;
import winterwell.utils.Printer;
import winterwell.utils.Process;
import winterwell.utils.StrUtils;
import winterwell.utils.TodoException;
import winterwell.utils.Utils;
import winterwell.utils.containers.ArrayMap;
import winterwell.utils.containers.Containers;
import winterwell.utils.containers.ITree;
import winterwell.utils.containers.Tree;
import winterwell.utils.gui.GuiUtils;
import winterwell.utils.io.FileUtils;
import winterwell.utils.reporting.Log;
import winterwell.utils.time.TUnit;
/**
* Web and xml-related utils.
*
* @author daniel
* @testedby {@link WebUtilsTest}
*/
public class WebUtils {
/**
 * Shared DOM factory used by {@link #parseXml(String)}. Starts out null and
 * is created on first use by parseXml2_getFactory().
 */
private static DocumentBuilderFactory docBuilderFactory;
/**
 * Matches text shaped like a dotted-quad IPv4 address: four groups of 1-3
 * digits separated by dots. The octets are not range-checked, so e.g.
 * "999.1.1.1" also matches.
 */
static final Pattern IP4_ADDRESS = Pattern
.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}");
/**
 * Urls for jQuery UI, in the order [js, css].
 */
public static final String[] JQUERY_UI_URLS = new String[] {
"http://ajax.googleapis.com/ajax/libs/jqueryui/1.8.5/jquery-ui.min.js",
"http://soda.sh/static/style/jquery-ui-1.8.5.custom.css" // Bleurgh - a sodash url
};
/** Url for a hosted copy of jQuery. */
public static final String JQUERY_URL = "http://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js";
public static final String MIME_TYPE_HTML = "text/html";
/**
 * application/javascript. (The alternative, text/javascript, is officially
 * obsolete but still the most widely used.)
 */
public static final String MIME_TYPE_JAVASCRIPT = "application/javascript";
public static final String MIME_TYPE_JSON = "application/json";
public static final String MIME_TYPE_MULTIPART_ALT = "multipart/alternative";
public static final String MIME_TYPE_MULTIPART_MIXED = "multipart/mixed";
public static final String MIME_TYPE_RSS = "application/rss+xml";
/**
 * Plain text, utf-8 encoded
 */
public static final String MIME_TYPE_TXT_UTF8 = "text/plain; charset=UTF-8";
public static final String MIME_TYPE_XML = "application/xml";
/**
 * Matches an xml comment - including some bad versions. Note that the
 * opening part allows zero dashes after {@code <!}, so a match can also
 * begin at other {@code <!...} constructs and run to the next dash(es)
 * followed by {@code >}.
 */
public static final Pattern pComment = Pattern.compile("<!-*.*?-+>",
Pattern.DOTALL);
/**
 * Matches a doctype element, case-insensitively, up to the first
 * {@code >}.
 */
public static final Pattern pDocType = Pattern.compile("<!DOCTYPE.*?>",
Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
/**
 * Matches href="whatever" and variants (single-quoted or unquoted). Group 1
 * is the url.
 */
public static final Pattern pHref = Pattern.compile(
"href=['\"]?([^'\"> \r\n\t]+)['\"]?", Pattern.CASE_INSENSITIVE);
/**
 * Used in strip tags to get rid of scripts and css style blocks altogether.
 * Requires at least one character of content between the open and close
 * tags.
 */
public static final Pattern pScriptOrStyle = Pattern.compile(
"<(script|style)[^<>]*>.+?</(script|style)>",
Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
/**
 * Shared SAX factory. Starts out null and is created on first use by
 * {@link #getXMLReader()}.
 */
static SAXParserFactory saxParserFactory = null;
/**
 * Matches an xml tag, e.g. {@code <a>}, {@code <br/>}, or {@code </a>}.
 * Group 1 is the tag name, with a leading / for closing tags.
 */
public static final Pattern TAG_REGEX = Pattern.compile(
"<(/?[a-zA-Z][a-zA-Z0-9]*)[^>]*>", Pattern.DOTALL);
/**
 * Matches http, https and ftp style urls. Note: the final character class
 * is narrower than the body class, so trailing punctuation such as a
 * full stop, comma or question mark is excluded from the match.
 *
 * @testedby {@link WebUtilsTest#testUrlRegex()}
 */
public static final Pattern URL_REGEX = Pattern
.compile("[hf]tt?ps?://[a-zA-Z0-9_%\\-\\.,\\?&\\/=\\+'~#!\\*:]+[a-zA-Z0-9_%\\-&\\/=\\+]");
/**
 * Note: XPaths are not thread safe, so best to create new ones as needed
 */
public static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance();
/**
 * Present a DOM {@link NodeList} as a {@link List}. This is a live view: no
 * nodes are copied, and every call is forwarded to the underlying NodeList.
 * The view does not support modification.
 *
 * @param scripts
 *            the node list to wrap
 * @return a list view backed by the node list
 */
public static List<Node> asList(final NodeList scripts) {
    final class NodeListView extends AbstractList<Node> {
        @Override
        public int size() {
            return scripts.getLength();
        }

        @Override
        public Node get(int index) {
            return scripts.item(index);
        }
    }
    return new NodeListView();
}
/**
 * Copy a DOM attribute map into a name-to-value map.
 *
 * @param nnMap
 *            the attributes to copy. Must not be null.
 * @return a new map from each item's node-name to its node-value
 */
public static Map<String, String> asMap(NamedNodeMap nnMap) {
    int n = nnMap.getLength();
    Map<String, String> map = new ArrayMap<String, String>(n);
    for (int i = 0; i < n; i++) {
        Node item = nnMap.item(i);
        // The value is all we need; the previous extra getTextContent()
        // call fetched a result that was never used.
        map.put(item.getNodeName(), item.getNodeValue());
    }
    return map;
}
/**
 * Encode text for use as the value of an XML attribute. The surrounding
 * quote marks are not added.
 *
 * @param text
 *            Can be null, in which case the empty string is returned.
 * @return the encoded text. Never null.
 */
public static String attributeEncode(String text) {
    if (text != null) {
        StringBuilder encoded = new StringBuilder(text.length() + 5);
        attributeEncode(encoded, text);
        return encoded.toString();
    }
    return "";
}
/**
* Encode text so that it can be used as the value of an XML attribute. The
* W3 spec is a little hazy on this (no, really). We encode ', " and
* &. All other chars are left alone. Does not add surrounding quote
* marks.
*
* @param helpText
*/
public static void attributeEncode(StringBuilder out, CharSequence text) {
for (int i = 0, len = text.length(); i < len; i++) {
char c = text.charAt(i);
if (c == '\'') {
out.append("'"); // alt ' but NOT '
} else if (c == '"') {
out.append("""); // alt "
} else if (c == '&') {
out.append("&");
} else {
out.append(c);
}
}
}
/**
 * Convert a color into an html/css color code.
 *
 * @param col
 *            the color to convert
 * @return E.g. "#ff0000" for Color.RED. Colors with transparency (alpha
 *         other than 255) come back in the css rgba form, where the alpha
 *         is written as a fraction between 0 and 1, e.g. roughly
 *         "rgba(0,255,0,0.5)" for half-transparent green. The #hex form is
 *         always used if there is no transparency.
 * @see GuiUtils#getColor(String)
 */
public static String color2html(Color col) {
    final int red = col.getRed();
    final int green = col.getGreen();
    final int blue = col.getBlue();
    final int alpha = col.getAlpha();
    if (alpha == 255) {
        // fully opaque: #rrggbb
        StringBuilder hex = new StringBuilder(7);
        hex.append('#');
        color2html2_hex(red, hex);
        color2html2_hex(green, hex);
        color2html2_hex(blue, hex);
        return hex.toString();
    }
    // transparency: css wants alpha as a 0-1 fraction
    float opacity = alpha / 255.0f;
    return "rgba(" + red + "," + green + "," + blue + "," + opacity + ")";
}
/**
 * Append one color channel as exactly two lower-case hex digits.
 */
private static void color2html2_hex(int r, StringBuilder html) {
    String digits = Integer.toHexString(r);
    assert digits.length() == 1 || digits.length() == 2;
    if (digits.length() == 1) {
        html.append('0');
    }
    html.append(digits);
}
/**
 * Look up the server name or IP address for a site by running the
 * command-line tool "dig". Unix-like systems only at present (throws
 * TodoException elsewhere).
 *
 * @param site
 *            alias or an IP address. Must not be null.
 * @param returnIP
 *            If true, return the first IPv4 address found in dig's output.
 *            If false, look for a server-name; when site is itself an IPv4
 *            address this does a reverse lookup.
 * @return IP or server name (without any trailing dot). Never null
 *         (exception on failure)
 * @testedby {@link WebUtilsTest#testGetIPof()} WARNING: this can fail
 *           sometimes for no good reason that I can see. Retrying usually
 *           succeeds.
 */
public static String dig(String site, boolean returnIP) {
    assert site != null;
    if (!Utils.OSisUnix())
        throw new TodoException();
    // A name was requested but we were given an IP: ask for a reverse lookup
    boolean reverseLookup = !returnIP && IP4_ADDRESS.matcher(site).matches();
    String flag = reverseLookup ? "-x " : "";
    Process p = new Process("dig +short " + flag + site);
    p.run();
    // this should be fast -- 5 seconds allows aeons of time
    p.waitFor(5000);
    String out = p.getOutput();
    if (returnIP) {
        Matcher ipMatcher = IP4_ADDRESS.matcher(out);
        if (!ipMatcher.find())
            throw new FailureException("Couldn't find IP address for " + site
                    + " in " + out);
        return ipMatcher.group();
    }
    // Looking for a name: take the first non-blank line that isn't an IP
    String ip = null;
    for (String line : StrUtils.splitLines(out)) {
        if (line.isEmpty()) {
            continue;
        }
        if (IP4_ADDRESS.matcher(line).matches()) {
            // remember it, in case no name turns up
            ip = line;
            continue;
        }
        return line.endsWith(".") ? line.substring(0, line.length() - 1)
                : line;
    }
    if (ip != null) {
        // we only got an address back -- try a reverse lookup on that
        return dig(ip, false);
    }
    throw new FailureException("Couldn't find server name or ip for "
            + site + " in [" + out + "] " + p.getError());
}
/**
 * Open a file in a system web browser.
 *
 * @param file
 *            the file to show
 * @see #display(URI)
 */
public static void display(File file) {
    URI location = file.toURI();
    display(location);
}
/**
 * Show some html in a web browser. The html is written to a new temp file
 * (prefix "temp", suffix ".html") which is then opened. The temp file is
 * deliberately left in place afterwards.
 *
 * @param page
 *            This is the HTML of the page. It is not a url!
 */
public static void display(String page) {
    try {
        File htmlFile = File.createTempFile("temp", ".html");
        FileUtils.write(htmlFile, page);
        // NB: no deleteOnExit() -- that proved to be a bad idea
        display(htmlFile);
    } catch (IOException ioProblem) {
        throw new IORException(ioProblem);
    }
}
/**
 * Open a URI in a system web browser.
 *
 * If {@link GuiUtils#isInteractive()} is false, this will return
 * immediately without doing anything.
 *
 * If the Java Desktop api is unsupported (e.g. KDE) and the operating
 * system reports itself as linux or unix, this falls back to running
 * "xdg-open".
 *
 * FIXME On Ubuntu, this leads to a daemon thread that can prevent programs
 * terminating until the browser closes.
 *
 * @param uri
 *            the location to open
 */
public static void display(URI uri) {
    if (!GuiUtils.isInteractive())
        return;
    try {
        Desktop.getDesktop().browse(uri);
    } catch (UnsupportedOperationException unsupported) {
        // KDE isn't supported :(
        boolean unixLike = Utils.getOperatingSystem().contains("linux")
                || Utils.getOperatingSystem().contains("unix");
        if (!unixLike)
            throw unsupported;
        Process opener = new Process("xdg-open " + uri);
        opener.run();
    } catch (IOException ioProblem) {
        throw new IORException(ioProblem);
    }
}
/**
 * Extract all instances of a tag from an xml page, using regular
 * expressions (so the xml can be badly formed). Open-close pairs are
 * listed first, followed (if includeTag is true) by self-closing
 * instances.
 * <p>
 * Does NOT cope with nested tags of the same type. Does NOT cope with
 * self-closing tags that lack the /> ending.
 *
 * @param tag
 *            E.g. "div". This is inserted into a regular expression as-is.
 * @param xml
 *            Can be badly formed xml
 * @param includeTag
 *            If false, return just the tag's content text (not the tag or
 *            the attributes), and skip self-closing tags (which have no
 *            content).
 * @return instances of tag. May be empty, never null
 */
public static List<String> extractXmlTags(String tag, String xml,
        boolean includeTag) {
    List<String> list = new ArrayList<String>();
    // Open-close, e.g. <div class='a'>...</div>
    Pattern p = Pattern.compile("<" + tag + "(\\s+[^>]*)?>(.*?)</" + tag
            + ">", Pattern.DOTALL);
    Matcher m = p.matcher(xml);
    while (m.find()) {
        list.add(includeTag ? m.group() : m.group(2));
    }
    // One tag, e.g. <img />
    if (includeTag) {
        // The lookahead stops the tag name matching as a prefix of a longer
        // name, e.g. "a" must not pick up <abbr/>.
        p = Pattern.compile("<" + tag + "(?![\\w:.-])[^>]*/>");
        m = p.matcher(xml);
        while (m.find()) {
            list.add(m.group());
        }
    }
    return list;
}
/**
 * Extract every opening or self-closing instance of a tag, e.g.
 * {@code <img src='a.png'>} or {@code <br/>}. Only the tag itself is
 * returned, never any content. Matching is case-insensitive.
 *
 * @param tag
 *            E.g. "img". This is inserted into a regular expression as-is.
 * @param xml
 *            Can be badly formed xml
 * @return the matching tags, in document order. May be empty, never null
 */
public static List<String> extractXmlTagsSelfClosing(String tag, String xml) {
    // The lookahead stops the tag name matching as a prefix of a longer
    // name, e.g. "a" must not pick up <abbr>.
    Pattern p = Pattern.compile("<" + tag + "(?![\\w:.-])[^>]*>",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(xml);
    List<String> list = new ArrayList<String>();
    while (m.find()) {
        list.add(m.group());
    }
    return list;
}
/**
 * Convenience for accessing an attribute value (the Node interface is
 * rather ugly)
 *
 * @param attribute
 *            name of the attribute
 * @param node
 *            the node to inspect. Nodes which do not carry attributes
 *            (e.g. text nodes) are fine, and give null.
 * @return value or null
 */
public static String getAttribute(String attribute, Node node) {
    NamedNodeMap att = node.getAttributes();
    // Only element nodes have an attribute map
    if (att == null)
        return null;
    Node w = att.getNamedItem(attribute);
    if (w == null)
        return null;
    return w.getTextContent();
}
/**
 * Crude XML tag parser: rips the attribute value out using a text scan.
 * Handles double-quoted, single-quoted and unquoted values. An unquoted
 * value runs up to the next whitespace or {@code >}, so e.g.
 * {@code getAttribute("href", "<a href=abc.com>woo</a>")} gives "abc.com".
 * <p>
 * The attribute name must not be preceded by another name character, so
 * asking for "href" will skip over "data-href".
 * <p>
 * FIXME escaped chars inside values are not handled.
 *
 * @param attribute
 *            name of the attribute
 * @param tag
 *            the text of the tag (anything after the tag is ignored,
 *            provided the value is properly terminated)
 * @return attribute-value or null if the attribute is not present
 * @throws IllegalArgumentException
 *             if the value is not terminated
 */
public static String getAttribute(String attribute, String tag) {
    String key = attribute + '=';
    // Find the first occurrence which is a whole attribute name
    int i = -1;
    int from = 0;
    while (i == -1) {
        int hit = tag.indexOf(key, from);
        if (hit == -1)
            return null;
        boolean wholeName = true;
        if (hit > 0) {
            char prev = tag.charAt(hit - 1);
            wholeName = !(Character.isLetterOrDigit(prev) || prev == '-'
                    || prev == '_' || prev == ':' || prev == '.');
        }
        if (wholeName) {
            i = hit + key.length();
        } else {
            from = hit + 1;
        }
    }
    if (i == tag.length())
        return null;
    char q = tag.charAt(i);
    if (q == '"' || q == '\'') {
        // quoted: read up to the matching quote
        int close = tag.indexOf(q, i + 1);
        if (close != -1)
            return tag.substring(i + 1, close);
    } else {
        // unquoted: read up to whitespace or the end of the tag
        for (int j = i; j < tag.length(); j++) {
            char c = tag.charAt(j);
            if (Character.isWhitespace(c) || c == '>')
                return tag.substring(i, j);
        }
    }
    throw new IllegalArgumentException(tag + " is not valid xml");
}
/**
 * Try to get the IP address(es) of the local machine, by running
 * "ifconfig" and collecting everything in its output that looks like an
 * IPv4 address. Unix only at present (throws TodoException elsewhere).
 *
 * @return the addresses found, in output order. Empty if ifconfig failed
 *         (the failure is logged). Never null.
 * @testedby {@link WebUtilsTest#testGetMyIP()}
 */
public static List<String> getMyIP() {
    if (!Utils.OSisUnix())
        throw new TodoException();
    Process ifconfig = new Process("ifconfig");
    try {
        ifconfig.run();
        ifconfig.waitFor(2000);
        ArrayList<String> addresses = new ArrayList<String>();
        for (Matcher m = IP4_ADDRESS.matcher(ifconfig.getOutput()); m.find();) {
            addresses.add(m.group());
        }
        return addresses;
    } catch (Exception e) {
        // ifconfig failed?!
        Log.report(e + " " + ifconfig.getError(), Level.SEVERE);
        return new ArrayList<String>();
    }
}
/**
 * The crudest possible http-get: no headers, no timeouts, no redirect
 * handling beyond what {@link HttpURLConnection} does by default.
 *
 * @see winterwell.web.FakeBrowser Or Apache's http client
 * @param url
 *            an http(s) url
 * @return the body of the response, as read by FileUtils.read()
 * @throws RuntimeException
 *             wrapping whatever went wrong (bad url, connection failure,
 *             ...)
 */
public static String getPage(String url) {
    InputStream in = null;
    try {
        HttpURLConnection connection = (HttpURLConnection) new URL(url)
                .openConnection();
        in = connection.getInputStream();
        return FileUtils.read(in);
    } catch (Exception ex) {
        throw Utils.runtime(ex);
    } finally {
        // Always release the stream, even if the read failed part-way.
        // Closing twice is harmless should the reader have closed it.
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // nothing useful to do about a failed close
            }
        }
    }
}
/**
 * Create a new XMLReader from the shared SAX factory, creating and
 * configuring that factory on first use.
 * <p>
 * Tries to construct a fault-tolerant fast xml reader (e.g. switches off
 * DTD loading). This has implications for entities! Which might not get
 * properly supported!
 * <p>
 * NOTE(review): the first-use initialisation of the shared factory is not
 * synchronized.
 *
 * @return an XMLReader whose entity resolver answers every request with
 *         empty content.
 * @throws RuntimeException
 *             wrapping any failure to create the reader
 * @testedby {@link WebUtilsTest#testGetXMLReader()}
 */
public static XMLReader getXMLReader() {
if (saxParserFactory == null) {
// Try to avoid using the built-in
// Which will probably be Xerces, which is fussy (chokes on valid
// xml, such as BBC RSS feeds)
// & slow (always downloads the DTD).
// The alternatives are loaded by name via reflection, so they are
// optional: a missing class is silently skipped.
for (String klassName : new String[] { "com.bluecast.xml.JAXPSAXParserFactory" // Piccolo
// ,"com.ctc.wstx.sax.WstxSAXParserFactory" // Woodstox
}) {
try {
Class<?> klass = Class.forName(klassName);
saxParserFactory = (SAXParserFactory) klass.newInstance();
} catch (Exception e) {
// oh well - not available, try the next option
}
}
// oh well - fall back to built-in (Xerces)
if (saxParserFactory == null) {
saxParserFactory = SAXParserFactory.newInstance();
}
// keep it simple
saxParserFactory.setValidating(false);
saxParserFactory.setNamespaceAware(false);
// hopeful switch-offs -- cf
// http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
// Not every factory recognises every feature, hence best-effort.
for (String badFeature : new String[] {
"http://xml.org/sax/features/namespaces",
"http://xml.org/sax/features/validation",
"http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
"http://apache.org/xml/features/nonvalidating/load-external-dtd" }) {
try {
saxParserFactory.setFeature(badFeature, false);
} catch (Exception e) {
// oh well - this factory doesn't support that feature
}
}
// saxParserFactory.setXIncludeAware(false); // throws an error!
// Default is false anyway
Log.report("Using SAX parser " + saxParserFactory);
}
try {
// A fresh reader for every call
XMLReader reader = saxParserFactory.newSAXParser().getXMLReader();
// Hopefully switch off DTD -- cf
// http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
// Every external entity (which includes DTDs) resolves to empty
// content, rather than being fetched.
reader.setEntityResolver(new EntityResolver() {
@Override
public InputSource resolveEntity(String publicId,
String systemId) throws SAXException, IOException {
return new InputSource(new StringReader(""));
// return null; ??
}
});
return reader;
} catch (Exception e) {
throw Utils.runtime(e);
}
}
/**
 * Minimal safety encoding: replaces the angle brackets with the
 * {@code &lt;} and {@code &gt;} entities to protect against injection
 * attacks. Nothing else is encoded (in particular, not ampersands or
 * quote marks).
 *
 * @param v
 *            Can be null (returns null)
 * @return v with the angle brackets replaced by entities
 * @deprecated Use CGIUtils.htmlEncode() instead (which wraps a Jakarta
 *             StringEscapeUtils library). This method mainly exists as a
 *             reminder!
 */
@Deprecated
public static String htmlEncode(String v) throws TodoException {
    if (v == null)
        return null;
    v = v.replace("<", "&lt;");
    v = v.replace(">", "&gt;");
    return v;
}
/**
 * Command-line entry point for rendering an html file to pdf.
 * <p>
 * Usage: COMMAND PARAMS, where COMMAND = render and PARAMS =
 * html-input-file pdf-output-file. Prints the usage message and exits if
 * the arguments are missing or incomplete (exit status 0 for no arguments
 * at all, 1 for an incomplete set).
 *
 * @param args
 *            "render", the html input file, the pdf output file
 */
public static void main(String[] args) {
    boolean noArgs = args == null || args.length == 0;
    // All three arguments are read below, so insist on them up front
    // rather than failing with an array-index error.
    if (noArgs || args.length < 3) {
        Printer.out("Usage: java -jar winterwell.utils.jar COMMAND PARAMS");
        Printer.out("Where COMMAND = render, PARAMS = html-input-file pdf-output-file ");
        System.exit(noArgs ? 0 : 1);
    }
    String cmd = args[0];
    assert cmd.equals("render");
    String html = FileUtils.read(new File(args[1]));
    File pdf = new File(args[2]);
    renderToPdf(html, pdf, null);
    Printer.out("Rendered pdf at: " + pdf);
}
/**
 * Parse an xml string into a DOM Document, using the shared (non-validating,
 * non-namespace-aware) document builder factory.
 * <p>
 * Historical note: stripping a leading DOCTYPE line was tried, to stop the
 * built-in xerces parser making a web call for the DTD. That just led to
 * other exceptions, over undeclared entities. TODO find a decent xml
 * parser.
 *
 * @param xml
 *            the document text
 * @return the parsed document
 * @throws RuntimeException
 *             wrapping any parse failure (which is also logged)
 * @testedby {@link WebUtilsTest#testParseXml()}
 */
public static Document parseXml(String xml) {
    // make sure the shared factory exists
    parseXml2_getFactory();
    try {
        DocumentBuilder domBuilder = docBuilderFactory.newDocumentBuilder();
        StringReader xmlReader = new StringReader(xml);
        return domBuilder.parse(new InputSource(xmlReader));
    } catch (Exception e) {
        // why are these things broken?
        Log.report(e + " with " + docBuilderFactory);
        throw Utils.runtime(e);
    }
}
/**
 * Create and configure the shared docBuilderFactory, if that has not been
 * done already. Does nothing on later calls.
 * <p>
 * NOTE(review): this first-use initialisation is not synchronized.
 */
private static void parseXml2_getFactory() {
if (docBuilderFactory != null)
return;
// TODO Try to avoid using the built-in!
// Which will probably be Xerces, which is fussy (chokes on valid xml,
// such as BBC RSS feeds)
// & slow (always downloads the DTD).
// NB: the list of alternatives is currently empty (both candidates are
// commented out), so this loop does nothing.
for (String klassName : new String[] {
// "com.bluecast.xml.JAXPSAXParserFactory" // Piccolo
// ,"com.ctc.wstx.sax.WstxSAXParserFactory" // Woodstox
}) {
try {
Class<?> klass = Class.forName(klassName);
docBuilderFactory = (DocumentBuilderFactory) klass
.newInstance();
} catch (Exception e) {
// oh well - not available
}
}
// oh well - fall back to built-in (Xerces)
if (docBuilderFactory == null) {
docBuilderFactory = DocumentBuilderFactory.newInstance();
}
// keep it simple
docBuilderFactory.setNamespaceAware(false);
docBuilderFactory.setValidating(false);
// hopeful switch-offs -- cf
// http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
// Not every factory recognises every feature, hence best-effort.
for (String badFeature : new String[] {
"http://xml.org/sax/features/namespaces",
"http://xml.org/sax/features/validation",
"http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
"http://apache.org/xml/features/nonvalidating/load-external-dtd" }) {
try {
docBuilderFactory.setFeature(badFeature, false);
} catch (Exception e) {
// oh well - this factory doesn't support that feature
}
}
// What bit of non-validating doesn't Xerces understand?!
// factory.setXIncludeAware(false); unnecessary and causes errors
Log.report("Using XML parser " + docBuilderFactory);
}
/**
 * A lighter-weight alternative to using Document and XPath: parse xml with
 * a SAX reader (see {@link #getXMLReader()}) into a tree of XMLNodes.
 *
 * @param xml
 *            the document text
 * @return a tree. The root node has no XMLNode (it is the document
 *         super-node).
 * @throws RuntimeException
 *             wrapping any parse failure
 *
 * @see CGIUtils#parseHtmlToTree(String)
 */
public static Tree<XMLNode> parseXmlToTree(String xml) {
    XMLReader saxReader = getXMLReader();
    XmlTreeBuilder handler = new XmlTreeBuilder();
    saxReader.setContentHandler(handler);
    try {
        InputSource source = new InputSource(new StringReader(xml));
        saxReader.parse(source);
        return handler.getTree();
    } catch (Exception e) {
        throw Utils.runtime(e);
    }
}
/**
 * Render html to a pdf file. This relies on: linux, and wkhtmltopdf being
 * installed and on the path. The html is written to a temp file, which is
 * deleted again afterwards whatever happens.
 *
 * @param html
 *            the page to render
 * @param file
 *            target pdf file
 * @param waitFor
 *            null, or extra command-line options for wkhtmltopdf, e.g.
 *            "--window-status FLOT_DONE" or "--javascript-delay 2000"
 *            (these give ajax time to run). FIXME these options seems to
 *            be buggy (Kubuntu, wkhtmltopdf 0.11.0rc1)
 * @throws RuntimeException
 *             wrapping any failure, including the target file not
 *             existing after the render
 *
 * @testedby {@link WebUtilsTest#testRenderToPdf()}
 */
public static void renderToPdf(String html, File file, String waitFor) {
    File temp1 = null;
    try {
        temp1 = File.createTempFile("page", ".html");
        FileUtils.write(temp1, html);
        assert temp1.exists();
        // 1. Render HTML to PDF with wkhtmltopdf
        String options = waitFor == null ? "" : waitFor;
        String command = "wkhtmltopdf " + options + " "
                + temp1.getAbsolutePath() + " " + file.getAbsolutePath();
        Process p = new Process(command);
        p.setEcho(true);
        p.run();
        // allow up to a minute
        p.waitFor(TUnit.MINUTE.getMillisecs());
        if (!file.exists())
            throw new IOException("Failed to create " + file + "\t"
                    + p.getError());
        final String noXServer = "cannot connect to X server";
        if (p.getOutput().contains(noXServer)
                || p.getError().contains(noXServer))
            throw new IOException(
                    "render failed: wkhtmltopdf couldn't connect to an X server");
        // debug spew
        Log.report("html",
                "RenderToPdf: " + p.getOutput() + "\t" + p.getError(),
                Level.FINE);
    } catch (Exception e) {
        throw Utils.runtime(e);
    } finally {
        // clean up the temp html
        if (temp1 != null) {
            FileUtils.delete(temp1);
        }
    }
}
/**
 * Render html to a png image, by way of an intermediate temp pdf (which is
 * deleted afterwards). This relies on: linux, and wkhtmltopdf and convert
 * (imagemagick) being installed.
 *
 * @param html
 *            the page to render
 * @param file
 *            target png file
 * @throws RuntimeException
 *             wrapping any failure, including the target file not
 *             existing after the conversion
 */
public static void renderToPng(String html, File file) {
    File pdf = null;
    try {
        // 1. Render to PDF
        pdf = File.createTempFile("chart", ".pdf");
        renderToPdf(html, pdf, "--javascript-delay 1000"); // FIXME
        assert pdf.exists() && pdf.length() > 0;
        // 2. Render, trim and convert to PNG with convert
        String command = "convert -trim -antialias -density 300 "
                + pdf.getAbsolutePath() + " " + file.getAbsolutePath();
        Process converter = new Process(command);
        converter.run();
        converter.waitFor(TUnit.MINUTE.getMillisecs());
        if (!file.exists())
            throw new IOException("Failed to create " + file + "\t"
                    + converter.getError());
    } catch (Exception e) {
        throw Utils.runtime(e);
    } finally {
        // clean up the temp pdf
        if (pdf != null) {
            FileUtils.delete(pdf);
        }
    }
}
/**
 * Convenience for calling {@link URI#resolve(URI)}.
 * <p>
 * E.g. "http://winterstein.me.uk"+"images" returns
 * "http://winterstein.me.uk/images"<br>
 * "http://winterstein.me.uk"+"http://google.com" returns
 * "http://google.com"<br>
 * "http://winterstein.me.uk/text"+"/images" returns
 * "http://winterstein.me.uk/images"<br>
 * "http://winterstein.me.uk/text"+"images" returns
 * "http://winterstein.me.uk/images"<br>
 * <p>
 * Note the last example: following the standard rules, the final segment
 * of the base path ("text") is treated as a page and dropped. If you want
 * it to be kept, then it is your responsibility to ensure that the base
 * ends with a trailing slash. The exception to this is where the base has
 * no path at all, e.g. "http://www.google.com". For convenience, in these
 * cases a path of "/" is supplied.
 *
 * @param base
 *            Typically a website or a directory on a website.
 * @param extension
 *            This may or may not be an extension -- it can also be an
 *            absolute uri.
 * @return the extension resolved against the base
 */
public static URI resolveUri(String base, String extension) {
    URI baseUri = URI(base);
    URI extensionUri = URI(extension);
    // no path at all? Then supply the root path
    URI anchor = Utils.isBlank(baseUri.getPath()) ? baseUri.resolve("/")
            : baseUri;
    return anchor.resolve(extensionUri);
}
/**
 * Encode text so that it can be used as a string in JavaScript. This simply
 * delegates to {@link #attributeEncode(String)}: the single quote, double
 * quote and ampersand are encoded, and all other chars are left alone.
 * Does not add surrounding quote marks.
 * <p>
 * NOTE(review): is attribute-style encoding really correct for JavaScript
 * strings? To be confirmed.
 *
 * @param msg
 *            the raw text
 * @return the encoded text
 */
public static String scriptEncode(String msg) {
    String encoded = attributeEncode(msg);
    return encoded;
}
/**
 * Strip any embedded scripts out of an HTML page. Use this to sanitise user
 * content to defend against ajax hacking attacks. Removes script tags
 * (with their contents, leaving a space), the name part of onXXX= event
 * handler attributes, and the start of href="javascript:..." urls.
 * <p>
 * All matching ignores case, since html does: {@code <SCRIPT>},
 * {@code ONCLICK=} and {@code JavaScript:} are stripped too.
 * <p>
 * WARNING: this is a bit over-zealous and may mangle some innocent text,
 * e.g. "onwards=stuff"
 *
 * @param xml
 *            the page. Must not be null.
 * @return the page with the script-related parts removed
 */
public static String stripScripts(String xml) {
    // strip script tags, plus their contents
    Pattern scriptBlock = Pattern.compile("<script.*?</script>",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    String noScript = scriptBlock.matcher(xml).replaceAll(" ");
    // strip onXXX handlers (this is a bit over-zealous)
    noScript = noScript.replaceAll("(?i)\\son\\w+=", "");
    // strip href="javascript:..." urls
    noScript = noScript.replaceAll("(?i)href='?\"?javascript:", "");
    return noScript;
}
/**
 * Strip any embedded CSS (i.e. style tags and their contents) out of an
 * HTML page. Call this <i>before</i> stripTags.
 *
 * @param xml
 *            the page
 * @return the page without its style blocks
 */
public static String stripStyle(String xml) {
    final String cssTag = "style";
    return stripTagContents(cssTag, xml);
}
/**
 * Strip out anything embedded in a "tagName" tag from xml, together with
 * the tag itself. Each removed block is replaced by a single space. Tag
 * names are matched ignoring case (so "script" also removes
 * {@code <SCRIPT>} blocks). Does not attempt to handle nested tags. Call
 * <i>before</i> calling stripTags.
 *
 * @param tagName
 *            E.g. "script". This is inserted into a regular expression
 *            as-is.
 * @param xml
 *            the page. Must not be null.
 * @return xml without the tagName blocks
 */
public static String stripTagContents(String tagName, String xml) {
    Pattern re = Pattern.compile("<" + tagName + ".*?</" + tagName + ">",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    return re.matcher(xml).replaceAll(" ");
}
/**
 * Remove xml and html tags, e.g. to safeguard against javascript injection
 * attacks, or to get plain text for NLP. Also removes script and style
 * blocks (contents included), doctype declarations and comments.
 *
 * @param xml
 *            can be null, in which case null will be returned
 * @return the text contents - ie input with all tags removed
 * @testedby {@link WebUtilsTest#testStripTags()}
 */
public static String stripTags(String xml) {
    if (xml == null)
        return null;
    // short cut if there are no tags
    if (xml.indexOf('<') == -1)
        return xml;
    // first all the scripts (cos we must remove the tag contents too)
    String txt = pScriptOrStyle.matcher(xml).replaceAll("");
    // The doctype must go before the comments: the lenient comment pattern
    // can start matching at "<!DOCTYPE" and would then swallow all the text
    // up to the end of the next comment.
    txt = pDocType.matcher(txt).replaceAll("");
    // comments
    txt = pComment.matcher(txt).replaceAll("");
    // now the tags
    txt = TAG_REGEX.matcher(txt).replaceAll("");
    return txt;
}
/**
 * Convenience for {@link URI#URI(String)} which swaps the checked exception
 * for an unchecked one.
 *
 * @param uri
 *            the text of the uri
 * @return the parsed uri
 * @throws RuntimeException
 *             wrapping the URISyntaxException, if uri is malformed
 */
public static URI URI(String uri) {
    try {
        URI parsed = new URI(uri);
        return parsed;
    } catch (URISyntaxException badSyntax) {
        throw new RuntimeException(badSyntax);
    }
}
/**
 * Convenience for {@link URL#toURI()} which swaps the checked exception for
 * an unchecked one.
 *
 * @param url
 *            the url to convert
 * @return the equivalent uri
 * @throws RuntimeException
 *             wrapping the URISyntaxException, if the url is not a valid
 *             uri
 */
public static URI URI(URL url) {
    try {
        URI converted = url.toURI();
        return converted;
    } catch (URISyntaxException badSyntax) {
        throw new RuntimeException(badSyntax);
    }
}
/**
 * Get the location a connection points at, as a URI.
 *
 * @param connection
 *            the connection (it does not need to be open)
 * @return the uri form of the connection's url
 * @throws RuntimeException
 *             wrapping the URISyntaxException, if the url is not a valid
 *             uri
 */
public static URI URI(URLConnection connection) {
    try {
        URL target = connection.getURL();
        return target.toURI();
    } catch (URISyntaxException badSyntax) {
        throw new RuntimeException(badSyntax);
    }
}
/**
 * URL decode, using UTF-8. Falls back to the platform default encoding in
 * the (in practice impossible) case that UTF-8 is not supported.
 *
 * @param s
 *            the encoded text. Must not be null.
 * @return the decoded text
 */
public static String urlDecode(String s) {
    try {
        return URLDecoder.decode(s, "UTF-8");
    } catch (UnsupportedEncodingException noUtf8) {
        return URLDecoder.decode(s);
    }
}
/**
 * Encode a map of variables as a url query string.
 *
 * @param vars
 *            the keys and values to send. Both get url-encoded via
 *            {@link #urlEncode(Object)}.
 * @return query string ripe for appending onto a url. Does not include a ?.
 *         E.g., you could do "http://mysite.com?" + urlEncoded, if you were
 *         so inclined. Note: every key=value pair is followed by an &amp;,
 *         including the last one.
 */
public static String urlEncode(Map vars) {
    StringBuilder query = new StringBuilder();
    for (Object key : vars.keySet()) {
        String encodedValue = urlEncode(vars.get(key));
        String encodedKey = urlEncode(key);
        query.append(encodedKey).append('=').append(encodedValue)
                .append('&');
    }
    return query.toString();
}
/**
 * URL encode, using UTF-8. Spaces are encoded as %20 rather than as +.
 *
 * @param x
 *            can be null (returns ""). Will be turned into a String using
 *            String.valueOf()
 * @return the encoded text. Never null.
 * @testedby {@link WebUtilsTest#testUrlEncode()}
 */
public static String urlEncode(Object x) {
    if (x == null)
        return "";
    String raw = String.valueOf(x);
    String encoded;
    try {
        encoded = URLEncoder.encode(raw, "UTF-8");
    } catch (UnsupportedEncodingException noUtf8) {
        encoded = URLEncoder.encode(raw);
    }
    // + for " " seems to be out of date.
    return encoded.replace("+", "%20");
}
/**
 * Use an xpath query to extract what is expected to be a single string
 * valued node. This is a convenience method for a common case.
 *
 * @param xpathQuery
 *            the query to run
 * @param node
 *            the context to run it over
 * @return the resulting node's text content, with line-endings normalised
 *         to "\n" -- or null if there is no such node.
 * @throws NotUniqueException
 *             if the query returns multiple nodes.
 */
public static String xpathExtractString(String xpathQuery, Node node)
        throws NotUniqueException {
    List<Node> hits = WebUtils.xpathQuery(xpathQuery, node);
    if (hits.isEmpty())
        return null;
    if (hits.size() != 1) {
        // report (up to) the first few hits
        throw new NotUniqueException(Printer.toString(Containers.subList(
                hits, 0, 3)));
    }
    String raw = hits.get(0).getTextContent();
    // Although W3C say
    // "All line-endings reported as a single LF character.", the parser
    // that ships with Java disagrees. So normalise them here.
    return StrUtils.LINEENDINGS.matcher(raw).replaceAll("\n");
}
/**
 * Run an XPath query over a node. The query is evaluated against a deep
 * clone of the node, which limits the context to that node (and means that
 * the results are clones too, not nodes from the original document).
 *
 * @see #xpathQuery(String, String, boolean)
 * @param xpathQuery
 *            the query to run
 * @param node
 *            the context node
 * @return the matching nodes. May be empty.
 * @throws RuntimeException
 *             wrapping an XPathExpressionException, if the query is bad
 * @testedby TODO
 */
public static List<Node> xpathQuery(String xpathQuery, Node node) {
    try {
        XPath xp = XPATH_FACTORY.newXPath();
        XPathExpression compiled = xp.compile(xpathQuery);
        // get a context limited to the node
        Node isolated = node.cloneNode(true);
        Object found = compiled.evaluate(isolated, XPathConstants.NODESET);
        return asList((NodeList) found);
    } catch (XPathExpressionException e) {
        throw Utils.runtime(e);
    }
}
/**
 * Convenience for {@link #xpathQuery(String, String, boolean)} without
 * namespace aware parsing.
 *
 * @param xpathQuery
 *            E.g. "//book[author='Joseph Heller']/title"
 * @param xml
 *            the document text
 * @return the matching nodes. May be empty.
 */
public static List<Node> xpathQuery(String xpathQuery, String xml) {
    final boolean namespaceAware = false;
    return xpathQuery(xpathQuery, xml, namespaceAware);
}
/**
 * Run an XPath query over an xml document.
 * <p>
 * <h3>XPath syntax</h3>
 * E.g. given the document
 *
 * <pre>
 * &lt;shelf&gt;
 * &lt;book year='1960'&gt;&lt;title&gt;Catch 22&lt;/title&gt;&lt;author&gt;Joseph Heller&lt;/author&gt;&lt;/book&gt;
 * &lt;book year='2007'&gt;&lt;title&gt;The English Swordsman&lt;/title&gt;&lt;author&gt;Daniel Winterstein&lt;/author&gt;&lt;/book&gt;
 * &lt;/shelf&gt;
 * </pre>
 *
 * You could have the queries:
 *
 * <pre>
 * "//book[author='Joseph Heller']/title"
 * "//book[@year='2007']"
 * "/shelf/book/title"
 * </pre>
 *
 * See http://www.zvon.org/xxl/XPathTutorial/General/examples.html for more
 * info.
 * <p>
 * Note: This method is not optimally efficient if the same query is
 * repeated, or the same document queried multiple times, since it parses
 * the xml and compiles the query afresh on each call.
 *
 * @param xpathQuery
 *            E.g. "//book[author='Joseph Heller']/title"
 * @param xml
 *            the document text
 * @param namespaceAware
 *            Must be false: namespace aware parsing is still a TODO (and
 *            is guarded by an assert).
 * @return the matching nodes. May be empty.
 * @throws RuntimeException
 *             wrapping any parse failure, or a bad query
 */
public static List<Node> xpathQuery(String xpathQuery, String xml,
        boolean namespaceAware) {
    assert !namespaceAware : "TODO";
    // Parse XML
    Document parsed = parseXml(xml);
    try {
        // Build and run the XPath query
        XPathExpression compiled = XPATH_FACTORY.newXPath().compile(
                xpathQuery);
        Object found = compiled.evaluate(parsed, XPathConstants.NODESET);
        return asList((NodeList) found);
    } catch (XPathExpressionException e) {
        throw Utils.runtime(e);
    }
}
}
/**
 * SAX handler used by {@link WebUtils#parseXmlToTree(String)} to build
 * DOM-like trees of XMLNodes. Rationale: Document and its associated classes
 * are awkward to work with.
 * <p>
 * Element nodes are created as each element starts. Character data is
 * buffered, and turned into a text node at the next element start or end.
 *
 * <p>
 * TODO should we add a filter on nodes, to pre-ignore boring ones?
 *
 * @author daniel
 */
final class XmlTreeBuilder extends DefaultHandler {
    /** The tree-node for the element currently being read. */
    ITree<XMLNode> activeTree;
    /** Set once the end of the document has been reached. */
    private boolean endFlag;
    /** The document super-node. Created when the document starts. */
    private Tree<XMLNode> root;
    /** Character data seen since the last element start/end. */
    StringBuilder text = new StringBuilder();

    public XmlTreeBuilder() {
    }

    @Override
    public void startDocument() throws SAXException {
        root = new Tree<XMLNode>();
        activeTree = root;
    }

    @Override
    public void endDocument() throws SAXException {
        endFlag = true;
    }

    @Override
    public void startElement(String uri, String localName, String name,
            Attributes atts) throws SAXException {
        // Any text before this element becomes a text node
        processTextBuffer();
        // Build the element's node, preferring the local name where there is one
        XMLNode element = new XMLNode(Utils.isBlank(localName) ? name
                : localName);
        int numAttributes = atts.getLength();
        for (int i = 0; i < numAttributes; i++) {
            element.getAttributes().put(atts.getQName(i), atts.getValue(i));
        }
        // Descend: the new tree-node becomes the active one
        activeTree = new Tree<XMLNode>(activeTree, element);
    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        text.append(ch, start, length);
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
        // do nothing
    }

    @Override
    public void endElement(String uri, String localName, String name)
            throws SAXException {
        // Any text inside this element becomes a text node
        processTextBuffer();
        // Pop node
        activeTree = activeTree.getParent();
    }

    /**
     * Turn any buffered character data into a text node under the active
     * tree-node, and reset the buffer. Does nothing if the buffer is empty.
     */
    private void processTextBuffer() {
        if (text.length() == 0)
            return;
        XMLNode textNode = new XMLNode(text.toString(), true);
        // presumably this constructor attaches the new tree-node to its
        // parent -- the result is not otherwise used
        new Tree<XMLNode>(activeTree, textNode);
        text = new StringBuilder();
    }

    /**
     * Only call this once parsing has finished. The root node has no XMLNode
     * (it is the document super-node).
     *
     * @return the tree built from the document
     */
    public Tree<XMLNode> getTree() {
        assert endFlag;
        return root;
    }
}