package ecar.util;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.validator.UrlValidator;
/**
* Copyright (c) 2009 Open Lab, http://www.open-lab.com/ Permission is hereby
* granted, free of charge, to any person obtaining a copy of this software and
* associated documentation files (the "Software"), to deal in the Software
* without restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
public class HtmlSanitizer {
public static Pattern forbiddenTags = Pattern
.compile("^(script|object|embed|link|style|form|input|font)$");
public static Pattern allowedTags = Pattern
.compile("^(b|p|i|s|a|img|table|thead|tbody|tfoot|tr|th|td|dd|dl|dt|em|h1|h2|h3|h4|h5|h6|li|ul|ol|span|div|strike|strong|"
+ "sub|sup|pre|del|code|blockquote|strike|kbd|br|hr|area|map|object|embed|param|link|form|small|big)$");
private static Pattern commentPattern = Pattern.compile("<!--.*"); // <!--.........>
private static Pattern tagStartPattern = Pattern
.compile("<(?i)(\\w+\\b)\\s*(.*)/?>$"); // <tag ....props.....>
private static Pattern tagClosePattern = Pattern
.compile("</(?i)(\\w+\\b)\\s*>$"); // </tag .........>
private static Pattern standAloneTags = Pattern.compile("^(img|br|hr)$");
private static Pattern selfClosed = Pattern.compile("<.+/>");
private static Pattern attributesPattern = Pattern
.compile("(\\w*)\\s*=\\s*\"([^\"]*)\""); // prop="...."
private static Pattern stylePattern = Pattern
.compile("([^\\s^:]+)\\s*:\\s*([^;]+);?"); // color:red;
private static Pattern urlStylePattern = Pattern
.compile("(?i).*\\b\\s*url\\s*\\(['\"]([^)]*)['\"]\\)"); // url('....')"
public static Pattern forbiddenStylePattern = Pattern
.compile("(?:(expression|eval|javascript))\\s*\\("); // expression(....)"
// thanks to
// Ben
// Summer
/**
* This method should be used to test input.
*
* @param html
* @return true if the input is "valid"
*/
public static boolean isSanitized(String html) {
return sanitizer(html).isValid;
}
/**
* Used to clean every html before to output it in any html page
*
* @param html
* @return sanitized html
*/
public static String sanitize(String html) {
return sanitizer(html).html;
}
/**
* Used to get the text, tags removed or encoded
*
* @param html
* @return sanitized text
*/
public static String getText(String html) {
return sanitizer(html).text;
}
/**
* This is the main method of sanitizing. It will be used both for
* validation and cleaning
*
* @param html
* @return a SanitizeResult object
*/
public static SanitizeResult sanitizer(String html) {
return sanitizer(html, allowedTags, forbiddenTags);
}
public static SanitizeResult sanitizer(String html, Pattern allowedTags,
Pattern forbiddenTags) {
SanitizeResult ret = new SanitizeResult();
Stack<String> openTags = new Stack();
List<String> tokens = tokenize(html);
// ------------------- LOOP for every token --------------------------
for (String token : tokens) {
boolean isAcceptedToken = false;
Matcher startMatcher = tagStartPattern.matcher(token);
Matcher endMatcher = tagClosePattern.matcher(token);
// --------------------------------------------------------------------------------
// COMMENT <!-- ......... -->
if (commentPattern.matcher(token).find()) {
ret.val = ret.val + token
+ (token.endsWith("-->") ? "" : "-->");
ret.invalidTags.add(token
+ (token.endsWith("-->") ? "" : "-->"));
continue;
// --------------------------------------------------------------------------------
// OPEN TAG <tag .........>
} else if (startMatcher.find()) {
// tag name extraction
String tag = startMatcher.group(1).toLowerCase();
// -----------------------------------------------------
// FORBIDDEN TAG <script .........>
if (forbiddenTags.matcher(tag).find()) {
ret.invalidTags.add("<" + tag + ">");
continue;
// -------------------------------------------------- WELL
// KNOWN TAG
} else if (allowedTags.matcher(tag).find()) {
String cleanToken = "<" + tag;
String tokenBody = startMatcher.group(2);
// first test table consistency
// table tbody tfoot thead th tr td
if ("thead".equals(tag) || "tbody".equals(tag)
|| "tfoot".equals(tag) || "tr".equals(tag)) {
if (openTags.search("table") < 1) {
ret.invalidTags.add("<" + tag + ">");
continue;
}
} else if ("td".equals(tag) || "th".equals(tag)) {
if (openTags.search("tr") < 1) {
ret.invalidTags.add("<" + tag + ">");
continue;
}
}
// then test properties
Matcher attributes = attributesPattern.matcher(tokenBody);
boolean foundURL = false; // URL flag
while (attributes.find()) {
String attr = attributes.group(1).toLowerCase();
String val = attributes.group(2);
// we will accept href in case of <A>
if ("a".equals(tag) && "href".equals(attr)) { // <a
// href="......">
String[] customSchemes = { "http", "https" };
if (new UrlValidator(customSchemes).isValid(val)) {
foundURL = true;
} else {
// may be it is a mailto?
// case <a
// href="mailto:pippo@pippo.com?subject=...."
if (val.toLowerCase().startsWith("mailto:")
&& val.indexOf("@") >= 0) {
String val1 = "http://www."
+ val
.substring(val.indexOf("@") + 1);
if (new UrlValidator(customSchemes)
.isValid(val1)) {
foundURL = true;
} else {
ret.invalidTags.add(attr + " " + val);
val = "";
}
} else {
ret.invalidTags.add(attr + " " + val);
val = "";
}
}
} else if (tag.matches("img|embed")
&& "src".equals(attr)) { // <img src="......">
String[] customSchemes = { "http", "https" };
if (new UrlValidator(customSchemes).isValid(val)) {
foundURL = true;
} else {
ret.invalidTags.add(attr + " " + val);
val = "";
}
} else if ("href".equals(attr) || "src".equals(attr)) { // <tag
// src/href="......">
// skipped
ret.invalidTags.add(tag + " " + attr + " " + val);
continue;
} else if (attr.matches("width|height")) { // <tag
// width/height="......">
if (!val.toLowerCase().matches("\\d+%|\\d+$")) { // test
// numeric
// values
ret.invalidTags.add(tag + " " + attr + " "
+ val);
continue;
}
} else if ("style".equals(attr)) { // <tag
// style="......">
// then test properties
Matcher styles = stylePattern.matcher(val);
String cleanStyle = "";
while (styles.find()) {
String styleName = styles.group(1)
.toLowerCase();
String styleValue = styles.group(2);
// suppress invalid styles values
if (forbiddenStylePattern.matcher(styleValue)
.find()) {
ret.invalidTags.add(tag + " " + attr + " "
+ styleValue);
continue;
}
// check if valid url
Matcher urlStyleMatcher = urlStylePattern
.matcher(styleValue);
if (urlStyleMatcher.find()) {
String[] customSchemes = { "http", "https" };
String url = urlStyleMatcher.group(1);
if (!new UrlValidator(customSchemes)
.isValid(url)) {
ret.invalidTags.add(tag + " " + attr
+ " " + styleValue);
continue;
}
}
cleanStyle = cleanStyle + styleName + ":"
+ encode(styleValue) + ";";
}
val = cleanStyle;
} else if (attr.startsWith("on")) { // skip all
// javascript events
ret.invalidTags.add(tag + " " + attr + " " + val);
continue;
} else { // by default encode all properies
val = encode(val);
}
cleanToken = cleanToken + " " + attr + "=\"" + val
+ "\"";
}
cleanToken = cleanToken + ">";
isAcceptedToken = true;
// for <img> and <a>
if (tag.matches("a|img|embed") && !foundURL) {
isAcceptedToken = false;
cleanToken = "";
}
token = cleanToken;
// push the tag if require closure and it is accepted
// (otherwirse is encoded)
if (isAcceptedToken
&& !(standAloneTags.matcher(tag).find() || selfClosed
.matcher(tag).find()))
openTags.push(tag);
// --------------------------------------------------------------------------------
// UNKNOWN TAG
} else {
ret.invalidTags.add(token);
ret.val = ret.val + token;
continue;
}
// --------------------------------------------------------------------------------
// CLOSE TAG </tag>
} else if (endMatcher.find()) {
String tag = endMatcher.group(1).toLowerCase();
// is self closing
if (selfClosed.matcher(tag).find()) {
ret.invalidTags.add(token);
continue;
}
if (forbiddenTags.matcher(tag).find()) {
ret.invalidTags.add("/" + tag);
continue;
}
if (!allowedTags.matcher(tag).find()) {
ret.invalidTags.add(token);
ret.val = ret.val + token;
continue;
} else {
String cleanToken = "";
// check tag position in the stack
int pos = openTags.search(tag);
// if found on top ok
for (int i = 1; i <= pos; i++) {
// pop all elements before tag and close it
String poppedTag = openTags.pop();
cleanToken = cleanToken + "</" + poppedTag + ">";
isAcceptedToken = true;
}
token = cleanToken;
}
}
ret.val = ret.val + token;
if (isAcceptedToken) {
ret.html = ret.html + token;
// ret.text = ret.text + " ";
} else {
String sanToken = htmlEncodeApexesAndTags(token);
ret.html = ret.html + sanToken;
ret.text = ret.text
+ htmlEncodeApexesAndTags(removeLineFeed(token));
}
}
// must close remaining tags
while (openTags.size() > 0) {
// pop all elements before tag and close it
String poppedTag = openTags.pop();
ret.html = ret.html + "</" + poppedTag + ">";
ret.val = ret.val + "</" + poppedTag + ">";
}
// set boolean value
ret.isValid = ret.invalidTags.size() == 0;
return ret;
}
/**
* Splits html tag and tag content <......>.
*
* @param html
* @return a list of token
*/
private static List<String> tokenize(String html) {
ArrayList tokens = new ArrayList();
int pos = 0;
String token = "";
int len = html.length();
while (pos < len) {
char c = html.charAt(pos);
String ahead = html.substring(pos, pos > len - 4 ? len : pos + 4);
// a comment is starting
if ("<!--".equals(ahead)) {
// store the current token
if (token.length() > 0)
tokens.add(token);
// clear the token
token = "";
// serch the end of <......>
int end = moveToMarkerEnd(pos, "-->", html);
tokens.add(html.substring(pos, end));
pos = end;
// a new "<" token is starting
} else if ('<' == c) {
// store the current token
if (token.length() > 0)
tokens.add(token);
// clear the token
token = "";
// serch the end of <......>
int end = moveToMarkerEnd(pos, ">", html);
tokens.add(html.substring(pos, end));
pos = end;
} else {
token = token + c;
pos++;
}
}
// store the last token
if (token.length() > 0)
tokens.add(token);
return tokens;
}
private static int moveToMarkerEnd(int pos, String marker, String s) {
int i = s.indexOf(marker, pos);
if (i > -1)
pos = i + marker.length();
else
pos = s.length();
return pos;
}
/**
* Contains the sanitizing results. html is the sanitized html encoded ready
* to be printed. Unaccepted tag are encode, text inside tag is always
* encoded MUST BE USED WHEN PRINTING HTML text is the text inside valid
* tags. Contains invalid tags encoded SHOULD BE USED TO PRINT EXCERPTS val
* is the html source cleaned from unaccepted tags. It is not encoded:
* SHOULD BE USED IN SAVE ACTIONS isValid is true when every tag is accepted
* without forcing encoding invalidTags is the list of encoded-killed tags
*/
public static class SanitizeResult {
public String html = "";
public String text = "";
public String val = "";
public boolean isValid = true;
public List<String> invalidTags = new ArrayList();
}
public static String encode(String s) {
return convertLineFeedToBR(htmlEncodeApexesAndTags(s == null ? "" : s));
}
public static final String htmlEncodeApexesAndTags(String source) {
return htmlEncodeTag(htmlEncodeApexes(source));
}
public static final String htmlEncodeApexes(String source) {
if (source != null) {
String result = replaceAllNoRegex(source,
new String[] { "\"", "'" }, new String[] { """,
"'" });
return result;
} else
return null;
}
public static final String htmlEncodeTag(String source) {
if (source != null) {
String result = replaceAllNoRegex(source,
new String[] { "<", ">" }, new String[] { "<", ">" });
return result;
} else
return null;
}
public static String convertLineFeedToBR(String text) {
if (text != null)
return replaceAllNoRegex(text, new String[] { "\n", "\f", "\r" },
new String[] { "<br>", "<br>", " " });
else
return null;
}
public static String removeLineFeed(String text) {
if (text != null)
return replaceAllNoRegex(text, new String[] { "\n", "\f", "\r" },
new String[] { " ", " ", " " });
else
return null;
}
public static final String replaceAllNoRegex(String source,
String searches[], String replaces[]) {
int k;
String tmp = source;
for (k = 0; k < searches.length; k++)
tmp = replaceAllNoRegex(tmp, searches[k], replaces[k]);
return tmp;
}
public static final String replaceAllNoRegex(String source, String search,
String replace) {
StringBuffer buffer = new StringBuffer();
if (source != null) {
if (search.length() == 0)
return source;
int oldPos, pos;
for (oldPos = 0, pos = source.indexOf(search, oldPos); pos != -1; oldPos = pos
+ search.length(), pos = source.indexOf(search, oldPos)) {
buffer.append(source.substring(oldPos, pos));
buffer.append(replace);
}
if (oldPos < source.length())
buffer.append(source.substring(oldPos));
}
return new String(buffer);
}
}