Package org.apache.abdera.ext.html

Source Code of org.apache.abdera.ext.html.HtmlCleaner$VoidElementFixHtmlSerializer

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  The ASF licenses this file to You
* under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.  For additional information regarding
* copyright in this work, please see the NOTICE file in the top level
* directory of this distribution.
*/
package org.apache.abdera.ext.html;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.Arrays;

import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.sax.HtmlSerializer;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

public class HtmlCleaner {

    private HtmlCleaner() {
    }

    public static String parse(String value) {
        return parse(new StringReader(value), true);
    }

    public static String parse(InputStream in) {
        return parse(in, "UTF-8");
    }

    public static String parse(InputStream in, String charset) {
        try {
            return parse(new InputStreamReader(in, charset), true);
        } catch (RuntimeException e) {
            throw e;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public static String parse(Reader in, boolean fragment) {
        try {
            nu.validator.htmlparser.sax.HtmlParser htmlParser = new nu.validator.htmlparser.sax.HtmlParser();
            htmlParser.setBogusXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
            htmlParser.setMappingLangToXmlLang(true);
            htmlParser.setReportingDoctype(false);
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            Writer w = new OutputStreamWriter(out, "UTF-8");
            HtmlSerializer ser = new VoidElementFixHtmlSerializer(w);
            htmlParser.setContentHandler(ser);
            htmlParser.setLexicalHandler(ser);
            if (!fragment)
                htmlParser.parse(new InputSource(in));
            else
                htmlParser.parseFragment(new InputSource(in), "div");
            try {
                w.flush();
            } catch (IOException e) {
            }
            return new String(out.toByteArray(), "UTF-8");
        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e.getMessage());
        }
    }

    private static class VoidElementFixHtmlSerializer extends HtmlSerializer {
        private static final String[] VOID_ELEMENTS =
            {"area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr", "img", "input", "link",
             "meta", "param", "spacer", "wbr"};
        private final Writer writer;

        public VoidElementFixHtmlSerializer(Writer out) {
            super(out);
            this.writer = out;
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            if (Arrays.binarySearch(VOID_ELEMENTS, localName) > -1) {
                try {
                    writer.write('<');
                    writer.write('/');
                    writer.write(localName);
                    writer.write('>');
                } catch (IOException e) {
                    throw new SAXException(e);
                }
            }
            super.endElement(uri, localName, name);
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            StringBuilder buf = new StringBuilder();
            for (int n = start; n < (start + length); n++) {
                if (ch[n] == '<')
                    buf.append("&lt;");
                else if (ch[n] == '>')
                    buf.append("&gt;");
                else if (ch[n] == '&') {
                    boolean isentity = false;
                    int i = n;
                    String ent = null;
                    for (; i < (start + length); i++) {
                        if (ch[i] == ';') {
                            ent = new String(ch, n, i - n + 1);
                            isentity = ent.matches("\\&[\\w]*\\;");
                            break;
                        }
                    }
                    if (isentity) {
                        buf.append(ent);
                        n = i;
                    } else {
                        buf.append("&amp;");
                    }
                } else
                    buf.append(ch[n]);
            }
            super.characters(buf.toString().toCharArray(), 0, buf.length());
        }
    }
}
TOP

Related Classes of org.apache.abdera.ext.html.HtmlCleaner$VoidElementFixHtmlSerializer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.