package org.pdf4j.saxon.event;
import org.pdf4j.saxon.tinytree.CompressedWhitespace;
import org.pdf4j.saxon.trans.XPathException;
import org.pdf4j.saxon.value.Whitespace;
import javax.xml.transform.OutputKeys;
/**
 * This class generates HTML output
 * @author Michael H. Kay
 */
public class HTMLEmitter extends XMLEmitter {
/** Preferred character representations */
// Representation codes; representationCode() maps the strings "native", "entity",
// "decimal" and "hex" (case-insensitively) onto these four values.
protected static final int REP_NATIVE = 0;
protected static final int REP_ENTITY = 1;
protected static final int REP_DECIMAL = 2;
protected static final int REP_HEX = 3;
// Representation consulted by writeEscape for characters that ARE in the output
// character set; set from the character-representation property in openDocument.
protected int nonASCIIRepresentation = REP_NATIVE;
// Representation consulted by writeEscape for characters that are NOT in the output
// character set; openDocument replaces REP_NATIVE with REP_ENTITY for this field.
protected int excludedRepresentation = REP_ENTITY;
// Script/style tracking: set to 0 when a no-namespace script or style element starts,
// reset to -1000000 in openDocument and endElement; characters() disables escaping
// while it is greater than zero.
// NOTE(review): no statement visible in this file raises the value above zero - confirm
// where the counter is advanced.
private int inScript;
// Set on the first call of openDocument so that repeated calls return early.
private boolean started = false;
// HTML version: 4 for "4.0"/"4.01", 5 for "5.0" (decoded in openDocument).
private int version = 4;
// Name and namespace URI code of the most recently started element (set in startElement).
private String elementName;
private short uriCode;
* Decode preferred representation
* @param rep string containing preferred representation (native, entity, decimal, or hex)
* @return integer code for the preferred representation
private static int representationCode(String rep) {
if (rep.equalsIgnoreCase("native")) return REP_NATIVE;
if (rep.equalsIgnoreCase("entity")) return REP_ENTITY;
if (rep.equalsIgnoreCase("decimal")) return REP_DECIMAL;
if (rep.equalsIgnoreCase("hex")) return REP_HEX;
return REP_ENTITY;
* Table of HTML tags that have no closing tag
static HTMLTagHashSet emptyTags = new HTMLTagHashSet(31);
static {
private static void setEmptyTag(String tag) {
protected static boolean isEmptyTag(String tag) {
return emptyTags.contains(tag);
/**
 * Tables of boolean attributes.
 * Two sets are kept: one that is looked up by attribute name alone, and one that is
 * looked up by the key "element+attribute". Checking the first set before building the
 * combined key avoids unnecessary string concatenations.
 */
private static HTMLTagHashSet booleanAttributes = new HTMLTagHashSet(31);
private static HTMLTagHashSet booleanCombinations = new HTMLTagHashSet(53);

static {
    // {element, attribute} pairs, registered in this order
    final String[][] pairs = {
        {"area", "nohref"},
        {"button", "disabled"},
        {"dir", "compact"},
        {"dl", "compact"},
        {"frame", "noresize"},
        {"hr", "noshade"},
        {"img", "ismap"},
        {"input", "checked"},
        {"input", "disabled"},
        {"input", "readonly"},
        {"menu", "compact"},
        {"object", "declare"},
        {"ol", "compact"},
        {"optgroup", "disabled"},
        {"option", "selected"},
        {"option", "disabled"},
        {"script", "defer"},
        {"select", "multiple"},
        {"select", "disabled"},
        {"td", "nowrap"},
        {"textarea", "disabled"},
        {"textarea", "readonly"},
        {"th", "nowrap"},
        {"ul", "compact"}
    };
    for (int p = 0; p < pairs.length; p++) {
        setBooleanAttribute(pairs[p][0], pairs[p][1]);
    }
}
private static void setBooleanAttribute(String element, String attribute) {
booleanCombinations.add(element + '+' + attribute);
private static boolean isBooleanAttribute(String element, String attribute, String value) {
return attribute.equalsIgnoreCase(value) &&
booleanAttributes.contains(attribute) &&
booleanCombinations.contains(element + '+' + attribute);
/**
 * Constructor. Performs no initialisation beyond that of the superclass.
 */
public HTMLEmitter() {
    super();
}
* Output start of document
public void open() throws XPathException {}
// Start-of-document processing for HTML output. After the first call it returns early.
// It decodes the version property, looks at the byte-order-mark property, emits a DOCTYPE
// when a system or public identifier is supplied (or always for version 5), and decodes
// the character-representation property into nonASCIIRepresentation and
// excludedRepresentation.
// Throws XPathException when the version property is set to anything other than
// "4.0", "4.01" or "5.0".
protected void openDocument() throws XPathException {
// NOTE(review): the statement guarded by this null check is not visible here - confirm it.
if (writer==null) {
if (started) return;
started = true;
// This method is sometimes called twice, especially during an identity transform
// This check stops two DOCTYPE declarations being output.
String versionProperty = outputProperties.getProperty(OutputKeys.VERSION);
// An absent version property leaves the field at its initial value (4).
if (versionProperty != null) {
if (versionProperty.equals("4.0") || versionProperty.equals("4.01")) {
version = 4;
} else if (versionProperty.equals("5.0")) {
version = 5;
} else {
XPathException err = new XPathException("Unsupported HTML version: " + versionProperty);
throw err;
String byteOrderMark = outputProperties.getProperty(SaxonOutputKeys.BYTE_ORDER_MARK);
// Byte-order-mark handling applies only when the property is "yes" and the encoding is UTF-8.
if ("yes".equals(byteOrderMark) &&
"UTF-8".equalsIgnoreCase(outputProperties.getProperty(OutputKeys.ENCODING))) {
// NOTE(review): the body of this try and the caught exception type are not visible here.
try {
} catch ( err) {
// Might be an encoding exception; just ignore it
String systemId = outputProperties.getProperty(OutputKeys.DOCTYPE_SYSTEM);
String publicId = outputProperties.getProperty(OutputKeys.DOCTYPE_PUBLIC);
// Treat "" as equivalent to absent. This goes beyond what the spec strictly allows.
if ("".equals(systemId)) {
systemId = null;
if ("".equals(publicId)) {
publicId = null;
// A DOCTYPE is written when either identifier is present, and always for version 5.
if (systemId!=null || publicId!=null || version==5) {
writeDocType("html", systemId, publicId);
empty = false;
// Large negative sentinel: "not inside a script or style element".
inScript = -1000000;
// Handle saxon:character-representation
// NOTE(review): the property key passed to getProperty is not visible here.
String representation = outputProperties.getProperty(
if (representation != null) {
String nonASCIIrep;
String excludedRep;
// The value is either a single keyword, used for both settings, or two keywords
// separated by ';' (first: characters in the encoding; second: characters outside it).
int semi = representation.indexOf(';');
if (semi < 0) {
nonASCIIrep = Whitespace.trim(representation);
excludedRep = nonASCIIrep;
} else {
nonASCIIrep = Whitespace.trim(representation.substring(0, semi));
excludedRep = Whitespace.trim(representation.substring(semi+1));
nonASCIIRepresentation = representationCode(nonASCIIrep);
excludedRepresentation = representationCode(excludedRep);
// A character that is not in the encoding cannot be written natively, so
// REP_NATIVE is replaced by REP_ENTITY for the excluded set.
if (excludedRepresentation == REP_NATIVE) {
excludedRepresentation = REP_ENTITY;
* Output the document type declaration
* @param type The element name
* @param systemId The DOCTYP system identifier
* @param publicId The DOCTYPE public identifier
protected void writeDocType(String type, String systemId, String publicId) throws XPathException {
if (version == 5) {
try {
writer.write("<!DOCTYPE HTML>\n");
} catch ( err) {
throw new XPathException(err);
} else {
super.writeDocType(type, systemId, publicId);
* Output element start tag
public void startElement(int nameCode, int typeCode, int locationId, int properties) throws XPathException {
super.startElement(nameCode, typeCode, locationId, properties);
uriCode = namePool.getURICode(nameCode);
elementName = (String)elementStack.peek();
if (uriCode==0 &&
( elementName.equalsIgnoreCase("script") ||
elementName.equalsIgnoreCase("style"))) {
inScript = 0;
public void startContent() throws XPathException {
closeStartTag(); // prevent <xxx/> syntax
* Write attribute name=value pair. Overrides the XML behaviour if the name and value
* are the same (we assume this is a boolean attribute to be minimised), or if the value is
* a URL.
protected void writeAttribute(int elCode, String attname, CharSequence value, int properties) throws XPathException {
try {
if (uriCode==0) {
if (isBooleanAttribute(elementName, attname, value.toString())) {
super.writeAttribute(elCode, attname, value, properties);
} catch ( err) {
throw new XPathException(err);
/**
 * Escape characters. Overrides the XML behaviour
 */
// Writes a character sequence to the output, escaping according to HTML rules.
// chars: the text to write; inAttribute: true when the text is an attribute value,
// which selects the attribute table of special characters and the attribute branch below.
// Throws XPathException for characters in the range 127..159.
// NOTE(review): several statements of this method are not visible in this view (loop
// increments, some writes, case labels, the declaration of preferHex) - the comments
// below describe only what the visible lines establish.
protected void writeEscape(final CharSequence chars, final boolean inAttribute)
throws, XPathException {
// start of the segment that has not yet been written
int segstart = 0;
final boolean[] specialChars = (inAttribute ? specialInAtt : specialInText);
// compressed whitespace knows how to write itself using the same special-character table
if (chars instanceof CompressedWhitespace) {
((CompressedWhitespace)chars).writeEscape(specialChars, writer);
// toggled by NUL characters embedded in the text (see below)
boolean disabled = false;
while (segstart < chars.length()) {
int i = segstart;
// find a maximal sequence of "ordinary" characters
if (nonASCIIRepresentation == REP_NATIVE) {
// native mode: ordinary means a non-special character below 127, or a character
// above 160 that the output character set can represent
char c;
while (i < chars.length() &&
((c = chars.charAt(i)) < 127 ? !specialChars[c] : (characterSet.inCharset(c) && c > 160)
) {
} else {
// other modes: only non-special characters below 127 are ordinary
char c;
while (i < chars.length() && (c = chars.charAt(i)) < 127 && !specialChars[c]) {
// if this was the whole string, output the string and quit
if (i == chars.length()) {
if (segstart == 0) {
} else {
writeCharSequence(chars.subSequence(segstart, i));
// otherwise, output this sequence and continue
if (i > segstart) {
writeCharSequence(chars.subSequence(segstart, i));
// the character that ended the ordinary run
final char c = chars.charAt(i);
if (c==0) {
// used to switch escaping on and off
disabled = !disabled;
} else if (disabled) {
} else if (c<=127) {
// handle a special ASCII character
if (inAttribute) {
if (c=='<') {
writer.write('<'); // not escaped
} else if (c=='>') {
// NOTE(review): the literal written here is a bare '>' - confirm that an escaped
// form was not intended
writer.write(">"); // recommended for older browsers
} else if (c=='&') {
if (i+1<chars.length() && chars.charAt(i+1)=='{') {
writer.write('&'); // not escaped if followed by '{'
} else {
} else if (c=='\"') {
} else if (c=='\n') {
} else if (c=='\t') {
// NOTE(review): the literal written here is a raw tab character - confirm that a
// character reference was not intended
writer.write("	");
} else if (c=='\r') {
} else {
if (c=='<') {
} else if (c=='>') {
// NOTE(review): the literal written here is a bare '>' - confirm that an escaped
// form was not intended
writer.write(">"); // changed to allow for "]]>"
} else if (c=='&') {
} else if (c=='\r') {
} else if (c==160) {
// always output NBSP as an entity reference
// NOTE(review): the string literal below does not contain an entity reference,
// which contradicts the comment above - confirm
writer.write(" ");
} else if (c>=127 && c<160) {
// these control characters are illegal in HTML
XPathException err = new XPathException("Illegal HTML character: decimal " + (int)c);
throw err;
} else if (c>=55296 && c<=56319) { //handle surrogate pair
//A surrogate pair is two consecutive Unicode characters. The first
//is in the range D800 to DBFF, the second is in the range DC00 to DFFF.
//To compute the numeric value of the character corresponding to a surrogate
//pair, use this formula (all numbers are hex):
//(FirstChar - D800) * 400 + (SecondChar - DC00) + 10000
// we'll trust the data to be sound
// (no bounds check is made before reading chars.charAt(i+1))
int charval = (((int)c - 55296) * 1024) + ((int)chars.charAt(i+1) - 56320) + 65536;
} else if (characterSet.inCharset(c)) {
// character is present in the encoding: use the nonASCIIRepresentation setting
switch(nonASCIIRepresentation) {
if (c>160 && c<=255) {
// if character in iso-8859-1, use an entity reference
// else fall through
preferHex = false;
case REP_HEX:
preferHex = true;
// fall through
} else {
// Character not present in encoding
switch(excludedRepresentation) {
if (c>160 && c<=255) {
// if character in iso-8859-1, use an entity reference
// else fall through
preferHex = false;
case REP_HEX:
preferHex = true;
// fall through
// step past the character just handled and start the next segment there
segstart = ++i;
* Output an element end tag.
public void endElement() throws XPathException {
String name = (String)elementStack.peek();
if (inScript==0) {
inScript = -1000000;
if (isEmptyTag(name) && uriCode==0) {
// no end tag required
} else {
* Character data.
public void characters (CharSequence chars, int locationId, int properties)
throws XPathException {
int options = properties;
if (inScript>0) {
options |= ReceiverOptions.DISABLE_ESCAPING;
super.characters(chars, locationId, options);
* Handle a processing instruction.
public void processingInstruction (String target, CharSequence data, int locationId, int properties)
throws XPathException
if (empty) {
for (int i=0; i<data.length(); i++) {
if (data.charAt(i) == '>') {
XPathException err = new XPathException("A processing instruction in HTML must not contain a > character");
throw err;
try {
writer.write(' ');
} catch ( err) {
throw new XPathException(err);
// HTML entity names for the characters U+00A0 to U+00FF, listed in code point order
// (96 entries; the first entry corresponds to U+00A0, decimal 160).
// NOTE(review): presumably indexed by (character code - 160); the lookup itself is not
// visible in this view - confirm against the caller.
protected static final String[] latin1Entities = {
"nbsp", // U+00A0 no-break space = non-breaking space
"iexcl", // U+00A1 inverted exclamation mark
"cent", // U+00A2 cent sign
"pound", // U+00A3 pound sign
"curren", // U+00A4 currency sign
"yen", // U+00A5 yen sign = yuan sign
"brvbar", // U+00A6 broken bar = broken vertical bar
"sect", // U+00A7 section sign
"uml", // U+00A8 diaeresis = spacing diaeresis
"copy", // U+00A9 copyright sign
"ordf", // U+00AA feminine ordinal indicator
"laquo", // U+00AB left-pointing double angle quotation mark = left pointing guillemet
"not", // U+00AC not sign
"shy", // U+00AD soft hyphen = discretionary hyphen
"reg", // U+00AE registered sign = registered trade mark sign
"macr", // U+00AF macron = spacing macron = overline = APL overbar
"deg", // U+00B0 degree sign
"plusmn", // U+00B1 plus-minus sign = plus-or-minus sign
"sup2", // U+00B2 superscript two = superscript digit two = squared
"sup3", // U+00B3 superscript three = superscript digit three = cubed
"acute", // U+00B4 acute accent = spacing acute
"micro", // U+00B5 micro sign
"para", // U+00B6 pilcrow sign = paragraph sign
"middot", // U+00B7 middle dot = Georgian comma = Greek middle dot
"cedil", // U+00B8 cedilla = spacing cedilla
"sup1", // U+00B9 superscript one = superscript digit one
"ordm", // U+00BA masculine ordinal indicator
"raquo", // U+00BB right-pointing double angle quotation mark = right pointing guillemet
"frac14", // U+00BC vulgar fraction one quarter = fraction one quarter
"frac12", // U+00BD vulgar fraction one half = fraction one half
"frac34", // U+00BE vulgar fraction three quarters = fraction three quarters
"iquest", // U+00BF inverted question mark = turned question mark
"Agrave", // U+00C0 latin capital letter A with grave
"Aacute", // U+00C1 latin capital letter A with acute
"Acirc", // U+00C2 latin capital letter A with circumflex
"Atilde", // U+00C3 latin capital letter A with tilde
"Auml", // U+00C4 latin capital letter A with diaeresis
"Aring", // U+00C5 latin capital letter A with ring above
"AElig", // U+00C6 latin capital letter AE = latin capital ligature AE
"Ccedil", // U+00C7 latin capital letter C with cedilla
"Egrave", // U+00C8 latin capital letter E with grave
"Eacute", // U+00C9 latin capital letter E with acute
"Ecirc", // U+00CA latin capital letter E with circumflex
"Euml", // U+00CB latin capital letter E with diaeresis
"Igrave", // U+00CC latin capital letter I with grave
"Iacute", // U+00CD latin capital letter I with acute
"Icirc", // U+00CE latin capital letter I with circumflex
"Iuml", // U+00CF latin capital letter I with diaeresis
"ETH", // U+00D0 latin capital letter ETH
"Ntilde", // U+00D1 latin capital letter N with tilde
"Ograve", // U+00D2 latin capital letter O with grave
"Oacute", // U+00D3 latin capital letter O with acute
"Ocirc", // U+00D4 latin capital letter O with circumflex
"Otilde", // U+00D5 latin capital letter O with tilde
"Ouml", // U+00D6 latin capital letter O with diaeresis
"times", // U+00D7 multiplication sign
"Oslash", // U+00D8 latin capital letter O with stroke = latin capital letter O slash
"Ugrave", // U+00D9 latin capital letter U with grave
"Uacute", // U+00DA latin capital letter U with acute
"Ucirc", // U+00DB latin capital letter U with circumflex
"Uuml", // U+00DC latin capital letter U with diaeresis
"Yacute", // U+00DD latin capital letter Y with acute
"THORN", // U+00DE latin capital letter THORN
"szlig", // U+00DF latin small letter sharp s = ess-zed
"agrave", // U+00E0 latin small letter a with grave
"aacute", // U+00E1 latin small letter a with acute
"acirc", // U+00E2 latin small letter a with circumflex
"atilde", // U+00E3 latin small letter a with tilde
"auml", // U+00E4 latin small letter a with diaeresis
"aring", // U+00E5 latin small letter a with ring above
"aelig", // U+00E6 latin small letter ae = latin small ligature ae
"ccedil", // U+00E7 latin small letter c with cedilla
"egrave", // U+00E8 latin small letter e with grave
"eacute", // U+00E9 latin small letter e with acute
"ecirc", // U+00EA latin small letter e with circumflex
"euml", // U+00EB latin small letter e with diaeresis
"igrave", // U+00EC latin small letter i with grave
"iacute", // U+00ED latin small letter i with acute
"icirc", // U+00EE latin small letter i with circumflex
"iuml", // U+00EF latin small letter i with diaeresis
"eth", // U+00F0 latin small letter eth
"ntilde", // U+00F1 latin small letter n with tilde
"ograve", // U+00F2 latin small letter o with grave
"oacute", // U+00F3 latin small letter o with acute
"ocirc", // U+00F4 latin small letter o with circumflex
"otilde", // U+00F5 latin small letter o with tilde
"ouml", // U+00F6 latin small letter o with diaeresis
"divide", // U+00F7 division sign
"oslash", // U+00F8 latin small letter o with stroke = latin small letter o slash
"ugrave", // U+00F9 latin small letter u with grave
"uacute", // U+00FA latin small letter u with acute
"ucirc", // U+00FB latin small letter u with circumflex
"uuml", // U+00FC latin small letter u with diaeresis
"yacute", // U+00FD latin small letter y with acute
"thorn", // U+00FE latin small letter thorn
"yuml" // U+00FF latin small letter y with diaeresis
// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
// you may not use this file except in compliance with the License. You may obtain a copy of the
// License at http://www.mozilla.org/MPL/
// Software distributed under the License is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the License for the specific language governing rights and limitations under the License.
// The Original Code is: all this file.
// The Initial Developer of the Original Code is Michael H. Kay.
// Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
// Contributor(s): none.