/*
* Copyright (c) 2003, 2004 Henri Sivonen and Taavi Hupponen
* Copyright (c) 2006 Henri Sivonen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.xml;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.Arrays;
import nu.validator.io.NcrEscapingWindows1252OutputStreamWriter;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
* Serializes a sequence of SAX events representing an XHTML 1.0 Strict document
* to an <code>OutputStream</code> as a UTF-8-encoded HTML 4.01 Strict
* document. The SAX events must represent a valid XHTML 1.0 document, except
* the namespace prefixes don't matter and there may be
* <code>startElement</code> and <code>endElement</code> calls for elements
* from other namespaces. The <code>startElement</code> and
* <code>endElement</code> calls for non-XHTML elements are ignored. No
* validity checking is performed. Hence, the emitter of the SAX events is
* responsible for making sure the events represent a document that meets the
* above requirements. The <code>OutputStream</code> is closed when the end of
* the document is seen.
*
* @version $Id$
* @author hsivonen
* @author taavi
*/
public class HtmlSerializer implements ContentHandler {
public final static int NO_DOCTYPE = 0;
public final static int DOCTYPE_HTML401_TRANSITIONAL = 1;
public final static int DOCTYPE_HTML401_STRICT = 2;
public final static int DOCTYPE_HTML5 = 3;
/**
* The XHTML namespace URI
*/
private final static String XHTML_NS = "http://www.w3.org/1999/xhtml";
/**
* HTML 4.01 elements which don't have an end tag
*/
private static final String[] emptyElements = { "area", "base", "basefont",
"br", "col", "command", "frame", "hr", "img", "input", "isindex",
"link", "meta", "param" };
/**
* Minimized "boolean" HTML attributes
*/
private static final String[] booleanAttributes = { "active", "async",
"autofocus", "autosubmit", "checked", "compact", "declare",
"default", "defer", "disabled", "ismap", "multiple", "nohref",
"noresize", "noshade", "nowrap", "readonly", "required", "selected" };
/**
* The writer used for output
*/
protected Writer writer;
private int doctype;
private String encoding;
private boolean emitMeta;
/**
* Creates a new instance of HtmlSerializer in the HTML 4.01 doctype mode
* with the UTF-8 encoding and no charset meta.
*
* @param out
* the stream to which the output is written
*/
public HtmlSerializer(OutputStream out) {
this(out, DOCTYPE_HTML401_STRICT, false, "UTF-8");
}
public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta) {
this(out, doctype, emitMeta, "UTF-8");
}
public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta,
String enc) {
this.emitMeta = emitMeta;
if (doctype < 0 || doctype > 3) {
throw new IllegalArgumentException("Bad doctype constant.");
}
this.doctype = doctype;
if ("UTF-8".equalsIgnoreCase(enc)) {
try {
this.encoding = "UTF-8";
this.writer = new OutputStreamWriter(out, "UTF-8");
} catch (UnsupportedEncodingException uee) {
throw new RuntimeException("UTF-8 not supported", uee);
}
} else if ("Windows-1252".equalsIgnoreCase(enc)) {
this.encoding = "Windows-1252";
this.writer = new NcrEscapingWindows1252OutputStreamWriter(out);
} else {
throw new IllegalArgumentException(
"Encoding must be UTF-8 or Windows-1252.");
}
}
/**
* Writes out characters.
*
* @param ch
* the source array
* @param start
* the index of the first character to be written
* @param length
* the number of characters to write
*
* @throws SAXException
* if there are IO problems
*/
public void characters(char[] ch, int start, int length)
throws SAXException {
try {
for (int j = 0; j < length; j++) {
char c = ch[start + j];
switch (c) {
case '<':
this.writer.write("<");
break;
case '>':
this.writer.write(">");
break;
case '&':
this.writer.write("&");
break;
default:
this.writer.write(c);
}
}
} catch (IOException ioe) {
throw (SAXException)new SAXException(ioe).initCause(ioe);
}
}
/**
* Must be called in the end.
*
* @throws SAXException
* if there are IO problems
*/
public void endDocument() throws SAXException {
try {
this.writer.close();
} catch (IOException ioe) {
throw (SAXException)new SAXException(ioe).initCause(ioe);
}
}
/**
* Writes an end tag if the element is an XHTML element and is not an empty
* element in HTML 4.01 Strict.
*
* @param namespaceURI
* the XML namespace
* @param localName
* the element name in the namespace
* @param qName
* ignored
*
* @throws SAXException
* if there are IO problems
*/
public void endElement(String namespaceURI, String localName, String qName)
throws SAXException {
try {
if (XHTML_NS.equals(namespaceURI)
&& Arrays.binarySearch(emptyElements, localName) < 0) {
this.writer.write("</");
this.writer.write(localName);
this.writer.write('>');
}
} catch (IOException ioe) {
throw (SAXException)new SAXException(ioe).initCause(ioe);
}
}
/**
* Must be called first.
*/
public void startDocument() throws SAXException {
try {
switch (doctype) {
case NO_DOCTYPE:
return;
case DOCTYPE_HTML5:
writer.write("<!DOCTYPE html>\n");
return;
case DOCTYPE_HTML401_STRICT:
writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n");
return;
case DOCTYPE_HTML401_TRANSITIONAL:
writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n");
return;
}
} catch (IOException ioe) {
throw (SAXException)new SAXException(ioe).initCause(ioe);
}
}
/**
* Writes a start tag if the element is an XHTML element.
*
* @param namespaceURI
* the XML namespace
* @param localName
* the element name in the namespace
* @param qName
* ignored
* @param atts
* the attribute list
*
* @throws SAXException
* if there are IO problems
*/
public void startElement(String namespaceURI, String localName,
String qName, Attributes atts) throws SAXException {
try {
if (XHTML_NS.equals(namespaceURI)) {
if ("meta".equals(localName)
&& ((atts.getIndex("", "http-equiv") != -1) || (atts.getIndex(
"", "httpequiv") != -1))) {
return;
}
// start and element name
this.writer.write('<');
this.writer.write(localName);
// attributes
int length = atts.getLength();
boolean langPrinted = false;
for (int i = 0; i < length; i++) {
String ns = atts.getURI(i);
String name = null;
if ("".equals(ns)) {
name = atts.getLocalName(i);
} else if ("http://www.w3.org/XML/1998/namespace".equals(ns)
&& "lang".equals(atts.getLocalName(i))) {
name = "lang";
}
if (name != null && !(langPrinted && "lang".equals(name))) {
this.writer.write(' ');
this.writer.write(name);
if ("lang".equals(name)) {
langPrinted = true;
}
if (Arrays.binarySearch(booleanAttributes, name) < 0) {
// write value, escape certain characters
this.writer.write("=\"");
String value = atts.getValue(i);
for (int j = 0; j < value.length(); j++) {
char c = value.charAt(j);
switch (c) {
case '<':
this.writer.write("<");
break;
case '>':
this.writer.write(">");
break;
case '&':
this.writer.write("&");
break;
case '"':
this.writer.write(""");
break;
default:
this.writer.write(c);
}
}
this.writer.write('"');
}
}
}
// close
this.writer.write('>');
if (emitMeta && "head".equals(localName)) {
this.writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=");
this.writer.write(encoding);
this.writer.write("\">");
}
}
} catch (IOException ioe) {
throw (SAXException)new SAXException(ioe).initCause(ioe);
}
}
/**
* Used for testing. Pass a file:// URL as the command line argument.
*/
public static void main(String[] args) {
try {
javax.xml.parsers.SAXParserFactory fac = javax.xml.parsers.SAXParserFactory.newInstance();
fac.setNamespaceAware(true);
fac.setValidating(false);
XMLReader parser = fac.newSAXParser().getXMLReader();
parser.setContentHandler(new HtmlSerializer(System.out));
parser.parse(args[0]);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/** Does nothing. */
public void endPrefixMapping(String str) throws SAXException {
}
/** Does nothing. */
public void ignorableWhitespace(char[] values, int param, int param2)
throws SAXException {
}
/** Does nothing. */
public void processingInstruction(String str, String str1)
throws SAXException {
}
/** Does nothing. */
public void setDocumentLocator(Locator locator) {
}
/** Does nothing. */
public void skippedEntity(String str) throws SAXException {
}
/** Does nothing. */
public void startPrefixMapping(String str, String str1) throws SAXException {
}
}