Package com.dotmarketing.util.diff.helper

Source Code of com.dotmarketing.util.diff.helper.NekoHtmlParser

/*
* Copyright 2004 Guy Van den Broeck
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dotmarketing.util.diff.helper;

import java.io.IOException;

import com.dotcms.repackage.org.cyberneko.html.parsers.SAXParser;
import com.dotcms.repackage.org.xml.sax.Attributes;
import com.dotcms.repackage.org.xml.sax.ContentHandler;
import com.dotcms.repackage.org.xml.sax.InputSource;
import com.dotcms.repackage.org.xml.sax.Locator;
import com.dotcms.repackage.org.xml.sax.SAXException;
import com.dotcms.repackage.org.xml.sax.helpers.AttributesImpl;

/**
* Parses HTML files using the Neko HTML parser. Puts all elements and attribute
* names to lowercase, removes all namespaces, produces well-formed XML.
*/
public class NekoHtmlParser {

    public SaxBuffer parse(InputSource is) throws IOException, SAXException {
        SaxBuffer buffer = new SaxBuffer();
        parse(is, buffer);

        return buffer;
    }

    public void parse(InputSource is, ContentHandler consumer)
            throws IOException, SAXException {
        if (is == null)
            throw new NullPointerException("is argument is required.");

        SAXParser parser = new SAXParser();
        parser.setFeature("http://xml.org/sax/features/namespaces", true);
        parser
                .setFeature(
                        "http://cyberneko.org/html/features/override-namespaces",
                        false);
        parser.setFeature(
                "http://cyberneko.org/html/features/insert-namespaces", false);
        parser
                .setFeature(
                        "http://cyberneko.org/html/features/scanner/ignore-specified-charset",
                        true);
        parser.setProperty(
                "http://cyberneko.org/html/properties/default-encoding",
                "UTF-8");
        parser.setProperty("http://cyberneko.org/html/properties/names/elems",
                "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs",
                "lower");

        parser.setContentHandler(new RemoveNamespacesHandler(
                new MergeCharacterEventsHandler(consumer)));
        parser.parse(is);
    }

    /**
     * A ContentHandler that drops all namespace information.
     */
    static class RemoveNamespacesHandler implements ContentHandler {
        private ContentHandler consumer;

        public RemoveNamespacesHandler(ContentHandler consumer) {
            this.consumer = consumer;
        }

        public void endDocument() throws SAXException {
            consumer.endDocument();
        }

        public void startDocument() throws SAXException {
            consumer.startDocument();
        }

        public void characters(char ch[], int start, int length)
                throws SAXException {
            consumer.characters(ch, start, length);
        }

        public void ignorableWhitespace(char ch[], int start, int length)
                throws SAXException {
            consumer.ignorableWhitespace(ch, start, length);
        }

        public void endPrefixMapping(String prefix) throws SAXException {
            // dropped on purpose
        }

        public void skippedEntity(String name) throws SAXException {
            // dropped on purpose
        }

        public void setDocumentLocator(Locator locator) {
            consumer.setDocumentLocator(locator);
        }

        public void processingInstruction(String target, String data)
                throws SAXException {
            // dropped on purpose
        }

        public void startPrefixMapping(String prefix, String uri)
                throws SAXException {
            // dropped on purpose
        }

        public void endElement(String namespaceURI, String localName,
                String qName) throws SAXException {
            consumer.endElement("", localName, localName);
        }

        public void startElement(String namespaceURI, String localName,
                String qName, Attributes atts) throws SAXException {
            AttributesImpl newAtts = new AttributesImpl(atts);
            for (int i = 0; i < atts.getLength(); i++) {
                newAtts.setURI(i, "");
                newAtts.setQName(i, newAtts.getLocalName(i));
            }
            consumer.startElement("", localName, localName, atts);
        }
    }

}
TOP

Related Classes of com.dotmarketing.util.diff.helper.NekoHtmlParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.