Package org.jboss.aesh.extensions.text.highlight.scanner

Source Code of org.jboss.aesh.extensions.text.highlight.scanner.HTMLScanner

/*
* Copyright 2012 Red Hat, Inc. and/or its affiliates.
*
* Licensed under the Eclipse Public License version 1.0, available at
* http://www.eclipse.org/legal/epl-v10.html
*/
package org.jboss.aesh.extensions.text.highlight.scanner;

import org.jboss.aesh.extensions.text.highlight.Encoder;
import org.jboss.aesh.extensions.text.highlight.Options;
import org.jboss.aesh.extensions.text.highlight.Scanner;
import org.jboss.aesh.extensions.text.highlight.StringScanner;
import org.jboss.aesh.extensions.text.highlight.Syntax;
import org.jboss.aesh.extensions.text.highlight.TokenType;
import org.jboss.aesh.extensions.text.highlight.WordList;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;

/*
* Based on https://github.com/rubychan/coderay/blob/master/lib/coderay/scanners/html.rb
* Last update sha: 8c3c0c49a98eb8daceb69d0b233d054fbbccc49e
*/
public class HTMLScanner implements Scanner {

    public static final String[] EVENT_ATTRIBUTES = new String[]{
            "onabort", "onafterprint", "onbeforeprint", "onbeforeunload", "onblur", "oncanplay",
            "oncanplaythrough", "onchange", "onclick", "oncontextmenu", "oncuechange", "ondblclick",
            "ondrag", "ondragdrop", "ondragend", "ondragenter", "ondragleave", "ondragover",
            "ondragstart", "ondrop", "ondurationchange", "onemptied", "onended", "onerror", "onfocus",
            "onformchange", "onforminput", "onhashchange", "oninput", "oninvalid", "onkeydown",
            "onkeypress", "onkeyup", "onload", "onloadeddata", "onloadedmetadata", "onloadstart",
            "onmessage", "onmousedown", "onmousemove", "onmouseout", "onmouseover", "onmouseup",
            "onmousewheel", "onmove", "onoffline", "ononline", "onpagehide", "onpageshow", "onpause",
            "onplay", "onplaying", "onpopstate", "onprogress", "onratechange", "onreadystatechange",
            "onredo", "onreset", "onresize", "onscroll", "onseeked", "onseeking", "onselect", "onshow",
            "onstalled", "onstorage", "onsubmit", "onsuspend", "ontimeupdate", "onundo", "onunload",
            "onvolumechange", "onwaiting"};

    public static final Pattern ATTR_NAME = Pattern.compile("[\\w.:-]+");
    public static final Pattern TAG_END = Pattern.compile("\\/?>");
    public static final Pattern HEX = Pattern.compile("[0-9a-fA-F]");
    public static final Pattern ENTITY = Pattern.compile("&(?:\\w+|\\#(?:\\d+|x" + HEX.pattern() + "+));");

    public static final Pattern SPACE = Pattern.compile("\\s+", Pattern.DOTALL);
    public static final Pattern CDATA_START = Pattern.compile("<!\\[CDATA\\[");
    public static final Pattern CDATA_END = Pattern.compile(".*?\\]\\]>", Pattern.DOTALL);
    public static final Pattern CDATA_ERROR = Pattern.compile(".+");
    public static final Pattern COMMENT = Pattern.compile("<!--(?:.*?-->|.*)", Pattern.DOTALL);
    public static final Pattern DOCTYPE = Pattern.compile("<!(\\w+)(?:.*?>|.*)|\\]>", Pattern.DOTALL);
    public static final Pattern PRE_PROCESSOR = Pattern.compile("<\\?xml(?:.*?\\?>|.*)");
    public static final Pattern COMMENT2 = Pattern.compile("<\\?(?:.*?\\?>|.*)", Pattern.DOTALL);
    public static final Pattern TAG = Pattern.compile("<\\/[-\\w.:]*>?", Pattern.DOTALL);
    public static final Pattern SPECIAL_TAG = Pattern.compile("<(?:(script|style)|[-\\w.:]+)(>)?", Pattern.DOTALL);
    public static final Pattern PLAIN = Pattern.compile("[^<>&]+");
    public static final Pattern ERROR = Pattern.compile("[<>&]");
    public static final Pattern EQUAL = Pattern.compile("=");
    public static final Pattern QUOTE = Pattern.compile("[\"']");
    public static final Pattern JAVASCRIPT_INLINE = Pattern.compile("javascript:[ \\t]*");
    public static final Pattern AMP = Pattern.compile("&");
    public static final Pattern END = Pattern.compile("[\\n>]");
    public static final Pattern SPECIAL_SPACE = Pattern.compile("[ \\t]*\\n");
    public static final Pattern SPECIAL_COMMENT = Pattern.compile("(\\s*<!--)(?:(.*?)(-->)|(.*))", Pattern.DOTALL);

    public static final Map<String, Pattern> PLAIN_STRING_CONTENT = new HashMap<String, Pattern>();

    {
        PLAIN_STRING_CONTENT.put("'", Pattern.compile("[^&'>\\n]+"));
        PLAIN_STRING_CONTENT.put("\"", Pattern.compile("[^&\">\\n]+"));
    }

    public enum EmbeddedType {
        script,
        style
    }

    public enum State {
        initial,
        in_special_tag,
        attribute,
        attribute_equal,
        attribute_value,
        attribute_value_string
    }

    public static final WordList<EmbeddedType> IN_ATTRIBUTE = new WordList<EmbeddedType>(null, true)
            .add(EVENT_ATTRIBUTES, EmbeddedType.script)
            .add(new String[]{"style"}, EmbeddedType.style);

    public static final Type TYPE = new Type("HTML", "\\.(html|htm|xhtml)$");

    @Override
    public Type getType() {
        return TYPE;
    }

    @Override
    public void scan(StringScanner source, Encoder encoder, Map<String, Object> options) {
        State state = State.initial;
        EmbeddedType in_attribute = null;
        String in_tag = null;
        Pattern plain_string_content = null;

        while (source.hasMore()) {
            MatchResult m = null;

            if (state != State.in_special_tag && (m = source.scan(SPACE)) != null) {
                encoder.textToken(m.group(), TokenType.space);
            }
            else {

                switch (state) {
                    case initial:

                        if ((m = source.scan(CDATA_START)) != null) {
                            encoder.textToken(m.group(), TokenType.inline_delimiter);
                            if ((m = source.scan(CDATA_END)) != null) {
                                encoder.textToken(m.group().substring(0, m.group().length() - 3), TokenType.plain);
                                encoder.textToken("]]>", TokenType.inline_delimiter);
                            }
                            else if ((m = source.scan(CDATA_ERROR)) != null) {
                                encoder.textToken(m.group(), TokenType.error);
                            }
                        }
                        else if ((m = source.scan(COMMENT)) != null) {
                            encoder.textToken(m.group(), TokenType.comment);
                        }
                        else if ((m = source.scan(DOCTYPE)) != null) {
                            encoder.textToken(m.group(), TokenType.doctype);
                        }
                        else if ((m = source.scan(PRE_PROCESSOR)) != null) {
                            encoder.textToken(m.group(), TokenType.preprocessor);
                        }
                        else if ((m = source.scan(COMMENT2)) != null) {
                            encoder.textToken(m.group(), TokenType.comment);
                        }
                        else if ((m = source.scan(TAG)) != null) {
                            in_tag = null;
                            encoder.textToken(m.group(), TokenType.tag);
                        }
                        else if ((m = source.scan(SPECIAL_TAG)) != null) {
                            encoder.textToken(m.group(), TokenType.tag);
                            in_tag = m.group(1);
                            if (m.group(2) != null) {
                                if (in_tag != null) {
                                    state = State.in_special_tag;
                                }
                            }
                            else {
                                state = State.attribute;
                            }
                        }
                        else if ((m = source.scan(PLAIN)) != null) {
                            encoder.textToken(m.group(), TokenType.plain);
                        }
                        else if ((m = source.scan(ENTITY)) != null) {
                            encoder.textToken(m.group(), TokenType.entity);
                        }
                        else if ((m = source.scan(ERROR)) != null) {
                            in_tag = null;
                            encoder.textToken(m.group(), TokenType.error);
                        }
                        else {
                            throw new RuntimeException("[BUG] else-case reached with state " + state + " in " + getClass());
                        }

                        break;
                    case attribute:

                        if ((m = source.scan(TAG_END)) != null) {
                            encoder.textToken(m.group(), TokenType.tag);
                            in_attribute = null;
                            if (in_tag != null) {
                                state = State.in_special_tag;
                            }
                            else {
                                state = State.initial;
                            }
                        }
                        else if ((m = source.scan(ATTR_NAME)) != null) {
                            in_attribute = IN_ATTRIBUTE.lookup(m.group());
                            encoder.textToken(m.group(), TokenType.attribute_name);
                            state = State.attribute_equal;
                        }
                        else {
                            in_tag = null;
                            encoder.textToken(source.next(), TokenType.error);
                        }

                        break;
                    case attribute_equal:

                        if ((m = source.scan(EQUAL)) != null) {
                            encoder.textToken(m.group(), TokenType.operator);
                            state = State.attribute_value;
                        }
                        else {
                            state = State.attribute;
                            break;
                        }

                    case attribute_value:
                        if ((m = source.scan(ATTR_NAME)) != null) {
                            encoder.textToken(m.group(), TokenType.attribute_value);
                            state = State.attribute;
                        }
                        else if ((m = source.scan(QUOTE)) != null) {
                            if (EmbeddedType.script == in_attribute || EmbeddedType.style == in_attribute) {
                                encoder.beginGroup(TokenType.string);
                                encoder.textToken(m.group(), TokenType.delimiter);
                                String groupStart = m.group();

                                if ((m = source.scan(JAVASCRIPT_INLINE)) != null) {
                                    encoder.textToken(m.group(), TokenType.comment);
                                }
                                String code = source.scanUntil(Pattern.compile("(?=" + groupStart + "|\\z)")).group();
                                if (EmbeddedType.script == in_attribute) {
                                    Syntax.Builder.create()
                                            .scannerType(JavaScriptScanner.TYPE.getName())
                                            .encoder(encoder)
                                            .execute(code);
                                }
                                else {
                                    Syntax.Builder.create()
                                            .scannerType(CSSScanner.TYPE.getName())
                                            .encoder(encoder)
                                            .scannerOptions(
                                                    Options.create()
                                                            .add(CSSScanner.OPTION_START_STATE, CSSScanner.State.block)
                                            )
                                            .execute(code);
                                }
                                m = source.scan(QUOTE);
                                if (m != null) {
                                    encoder.textToken(m.group(), TokenType.delimiter);
                                }
                                encoder.endGroup(TokenType.string);
                                state = State.attribute;
                                in_attribute = null;
                            }
                            else {
                                encoder.beginGroup(TokenType.string);
                                state = State.attribute_value_string;
                                plain_string_content = PLAIN_STRING_CONTENT.get(m.group());
                                encoder.textToken(m.group(), TokenType.delimiter);
                            }
                        }
                        else if ((m = source.scan(TAG_END)) != null) {
                            encoder.textToken(m.group(), TokenType.tag);
                            state = State.initial;
                        }
                        else {
                            encoder.textToken(source.next(), TokenType.error);
                        }
                        break;
                    case attribute_value_string:

                        if ((m = source.scan(plain_string_content)) != null) {
                            encoder.textToken(m.group(), TokenType.content);
                        }
                        else if ((m = source.scan(QUOTE)) != null) {
                            encoder.textToken(m.group(), TokenType.delimiter);
                            encoder.endGroup(TokenType.string);
                            state = State.attribute;
                        }
                        else if ((m = source.scan(ENTITY)) != null) {
                            encoder.textToken(m.group(), TokenType.entity);
                        }
                        else if ((m = source.scan(AMP)) != null) {
                            encoder.textToken(m.group(), TokenType.content);
                        }
                        else if ((m = source.scan(END)) != null) {
                            encoder.endGroup(TokenType.string);
                            state = State.initial;
                            encoder.textToken(m.group(), TokenType.error);
                        }
                        break;
                    case in_special_tag:

                        if ("script".equalsIgnoreCase(in_tag) || "style".equalsIgnoreCase(in_tag)) {
                            String code = null;
                            String closing = null;
                            if ((m = source.scan(SPECIAL_SPACE)) != null) {
                                encoder.textToken(m.group(), TokenType.space);
                            }
                            if ((m = source.scan(SPECIAL_COMMENT)) != null) {
                                code = m.group(2);
                                if (code == null) {
                                    code = m.group(4);
                                }
                                closing = m.group(3);
                                encoder.textToken(m.group(1), TokenType.comment);
                            }
                            else {
                                code = source.scanUntil("(?=(?:\\n\\s*)?<\\/" + in_tag + ">)|\\z").group();
                                closing = null;
                            }
                            if (code != null && !code.isEmpty()) {
                                encoder.beginGroup(TokenType.inline);
                                if ("script".equalsIgnoreCase(in_tag)) {
                                    Syntax.Builder.create()
                                            .scannerType(JavaScriptScanner.TYPE.getName())
                                            .encoder(encoder)
                                            .execute(code);
                                }
                                else {
                                    Syntax.Builder.create()
                                            .scannerType(CSSScanner.TYPE.getName())
                                            .encoder(encoder)
                                            .execute(code);
                                }
                                encoder.endGroup(TokenType.inline);
                            }
                            if (closing != null) {
                                encoder.textToken(closing, TokenType.comment);
                            }
                            state = State.initial;
                        }
                        else {
                            throw new RuntimeException("unknown special tag " + in_tag);
                        }
                        break;
                    default:
                        throw new RuntimeException("Unknown state " + state);
                }
            }
        }
    }

}
TOP

Related Classes of org.jboss.aesh.extensions.text.highlight.scanner.HTMLScanner

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.