/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. The ASF licenses this file to You
* under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. For additional information regarding
* copyright in this work, please see the NOTICE file in the top level
* directory of this distribution.
*/
package org.apache.abdera.ext.html;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.Arrays;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.sax.HtmlSerializer;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class HtmlCleaner {
private HtmlCleaner() {
}
public static String parse(String value) {
return parse(new StringReader(value), true);
}
public static String parse(InputStream in) {
return parse(in, "UTF-8");
}
public static String parse(InputStream in, String charset) {
try {
return parse(new InputStreamReader(in, charset), true);
} catch (RuntimeException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
public static String parse(Reader in, boolean fragment) {
try {
nu.validator.htmlparser.sax.HtmlParser htmlParser = new nu.validator.htmlparser.sax.HtmlParser();
htmlParser.setBogusXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
htmlParser.setMappingLangToXmlLang(true);
htmlParser.setReportingDoctype(false);
ByteArrayOutputStream out = new ByteArrayOutputStream();
Writer w = new OutputStreamWriter(out, "UTF-8");
HtmlSerializer ser = new VoidElementFixHtmlSerializer(w);
htmlParser.setContentHandler(ser);
htmlParser.setLexicalHandler(ser);
if (!fragment)
htmlParser.parse(new InputSource(in));
else
htmlParser.parseFragment(new InputSource(in), "div");
try {
w.flush();
} catch (IOException e) {
}
return new String(out.toByteArray(), "UTF-8");
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException(e.getMessage());
}
}
private static class VoidElementFixHtmlSerializer extends HtmlSerializer {
private static final String[] VOID_ELEMENTS =
{"area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr", "img", "input", "link",
"meta", "param", "spacer", "wbr"};
private final Writer writer;
public VoidElementFixHtmlSerializer(Writer out) {
super(out);
this.writer = out;
}
@Override
public void endElement(String uri, String localName, String name) throws SAXException {
if (Arrays.binarySearch(VOID_ELEMENTS, localName) > -1) {
try {
writer.write('<');
writer.write('/');
writer.write(localName);
writer.write('>');
} catch (IOException e) {
throw new SAXException(e);
}
}
super.endElement(uri, localName, name);
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
StringBuilder buf = new StringBuilder();
for (int n = start; n < (start + length); n++) {
if (ch[n] == '<')
buf.append("<");
else if (ch[n] == '>')
buf.append(">");
else if (ch[n] == '&') {
boolean isentity = false;
int i = n;
String ent = null;
for (; i < (start + length); i++) {
if (ch[i] == ';') {
ent = new String(ch, n, i - n + 1);
isentity = ent.matches("\\&[\\w]*\\;");
break;
}
}
if (isentity) {
buf.append(ent);
n = i;
} else {
buf.append("&");
}
} else
buf.append(ch[n]);
}
super.characters(buf.toString().toCharArray(), 0, buf.length());
}
}
}