Package org.ryu22e.nico2cal.util

Source Code of org.ryu22e.nico2cal.util.HtmlRemoveUtil

/**
*
*/
package org.ryu22e.nico2cal.util;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;

import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
* 文字列中のHTMLのタグを除去するユーティリティクラス。
* @author ryu22e
*
*/
public final class HtmlRemoveUtil {

    /**
     * コンストラクタ。
     */
    private HtmlRemoveUtil() {
        throw new AssertionError("Can not call this constructor.");
    }

    /**
     * 文字列中のHTMLタグを除去する。
     * @param html HTMLタグを含む文字列
     * @return HTMLタグを除去された文字列
     * @throws IOException
     * @throws SAXException
     */
    public static String removeHtml(String html) throws SAXException,
            IOException {
        if (html == null) {
            return null;
        }

        DOMFragmentParser parser = new DOMFragmentParser();

        // フィルターの設定
        ElementRemover remover = new ElementRemover();
        XMLDocumentFilter[] filters = { remover };
        parser.setProperty(
            "http://cyberneko.org/html/properties/filters",
            filters);
        HTMLDocument document = new HTMLDocumentImpl();
        DocumentFragment fragment = document.createDocumentFragment();

        InputSource inputSource = new InputSource(new StringReader(html));
        parser.parse(inputSource, fragment);
        StringWriter writer = new StringWriter();
        OutputFormat format = new OutputFormat();

        format.setOmitXMLDeclaration(true);
        XMLSerializer serializer = new XMLSerializer();
        serializer.setOutputCharStream(writer);
        serializer.setOutputFormat(format);
        serializer.serialize(fragment);

        return writer.getBuffer().toString();
    }

}
TOP

Related Classes of org.ryu22e.nico2cal.util.HtmlRemoveUtil

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.