/**
*
*/
package org.ryu22e.nico2cal.util;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Utility class that strips HTML tags from strings.
 *
 * @author ryu22e
 */
public final class HtmlRemoveUtil {

    /** Parser property name under which the document filter chain is registered. */
    private static final String FILTERS_PROPERTY =
            "http://cyberneko.org/html/properties/filters";

    /**
     * Private constructor; this class only exposes static methods.
     * Always throws so that the class cannot be instantiated.
     */
    private HtmlRemoveUtil() {
        throw new AssertionError("Can not call this constructor.");
    }

    /**
     * Removes the HTML tags contained in a string.
     *
     * <p>The input is parsed as an HTML fragment through an
     * {@link ElementRemover} filter and the resulting fragment is written
     * back out with an {@link XMLSerializer} (XML declaration omitted).
     *
     * <p>NOTE(review): because the result is produced by an XML serializer,
     * characters such as {@code &} are presumably emitted in escaped form
     * rather than as raw text - confirm against callers' expectations.
     *
     * @param html string that may contain HTML tags; may be {@code null}
     * @return the string with HTML tags removed, or {@code null} when
     *         {@code html} is {@code null}
     * @throws SAXException if configuring the parser or parsing fails
     * @throws IOException if reading the input or serializing the result fails
     */
    public static String removeHtml(String html) throws SAXException,
            IOException {
        if (html == null) {
            return null;
        }
        return serialize(parseThroughRemover(html));
    }

    /**
     * Parses the given markup into a new document fragment, with an
     * {@link ElementRemover} installed as the parser's only filter.
     */
    private static DocumentFragment parseThroughRemover(String html)
            throws SAXException, IOException {
        DOMFragmentParser fragmentParser = new DOMFragmentParser();
        // Filter setup: no elements are registered as accepted or removed on
        // the remover here; presumably this makes it drop every tag while
        // keeping text content - confirm against the NekoHTML documentation.
        fragmentParser.setProperty(
                FILTERS_PROPERTY,
                new XMLDocumentFilter[] {new ElementRemover()});

        // The fragment needs an owner document to be created from.
        HTMLDocument ownerDocument = new HTMLDocumentImpl();
        DocumentFragment target = ownerDocument.createDocumentFragment();
        fragmentParser.parse(new InputSource(new StringReader(html)), target);
        return target;
    }

    /**
     * Serializes the fragment to a string without an XML declaration.
     */
    private static String serialize(DocumentFragment fragment)
            throws IOException {
        StringWriter sink = new StringWriter();
        OutputFormat outputFormat = new OutputFormat();
        outputFormat.setOmitXMLDeclaration(true);

        XMLSerializer xmlSerializer = new XMLSerializer();
        xmlSerializer.setOutputCharStream(sink);
        xmlSerializer.setOutputFormat(outputFormat);
        xmlSerializer.serialize(fragment);
        return sink.toString();
    }
}