// Source Code of org.pdf4j.saxon.event.HTMLURIEscaper

package org.pdf4j.saxon.event;
import org.pdf4j.saxon.charcode.UnicodeCharacterSet;
import org.pdf4j.saxon.codenorm.Normalizer;
import org.pdf4j.saxon.om.FastStringBuffer;
import org.pdf4j.saxon.om.NamePool;
import org.pdf4j.saxon.trans.XPathException;


import java.util.HashMap;


/**
  * This class is used as a filter on the serialization pipeline; it performs the function
  * of escaping URI-valued attributes in HTML
  * @author Michael H. Kay
  */


public class HTMLURIEscaper extends ProxyReceiver {


    /**
    * Table of attributes whose value is a URL
    */


    // we use two HashMaps to avoid unnecessary string concatenations


    private static HTMLTagHashSet urlAttributes = new HTMLTagHashSet(47);
    private static HTMLTagHashSet urlCombinations = new HTMLTagHashSet(101);


    static {
        setUrlAttribute("form", "action");
        setUrlAttribute("object", "archive");
        setUrlAttribute("body", "background");
        setUrlAttribute("q", "cite");
        setUrlAttribute("blockquote", "cite");
        setUrlAttribute("del", "cite");
        setUrlAttribute("ins", "cite");
        setUrlAttribute("object", "classid");
        setUrlAttribute("object", "codebase");
        setUrlAttribute("applet", "codebase");
        setUrlAttribute("object", "data");
        setUrlAttribute("button", "datasrc");
        setUrlAttribute("div", "datasrc");
        setUrlAttribute("input", "datasrc");
        setUrlAttribute("object", "datasrc");
        setUrlAttribute("select", "datasrc");
        setUrlAttribute("span", "datasrc");
        setUrlAttribute("table", "datasrc");
        setUrlAttribute("textarea", "datasrc");
        setUrlAttribute("script", "for");
        setUrlAttribute("a", "href");
        setUrlAttribute("a", "name");       // see second note in section B.2.1 of HTML 4 specification
        setUrlAttribute("area", "href");
        setUrlAttribute("link", "href");
        setUrlAttribute("base", "href");
        setUrlAttribute("img", "longdesc");
        setUrlAttribute("frame", "longdesc");
        setUrlAttribute("iframe", "longdesc");
        setUrlAttribute("head", "profile");
        setUrlAttribute("script", "src");
        setUrlAttribute("input", "src");
        setUrlAttribute("frame", "src");
        setUrlAttribute("iframe", "src");
        setUrlAttribute("img", "src");
        setUrlAttribute("img", "usemap");
        setUrlAttribute("input", "usemap");
        setUrlAttribute("object", "usemap");
    }


    private static void setUrlAttribute(String element, String attribute) {
        urlAttributes.add(attribute);
        urlCombinations.add(element + '+' + attribute);
    }


    private HashMap urlAttributeCache = new HashMap(30);


    public boolean isUrlAttribute(int element, int attribute) {
        Long key = new Long(((long)element)<<32 | (long)attribute);
        Boolean result = (Boolean)urlAttributeCache.get(key);
        if (result != null) {
            return result.booleanValue();
        }
        if (pool == null) {
            pool = getNamePool();
        }
        String attributeName = pool.getDisplayName(attribute);
        if (!urlAttributes.contains(attributeName)) {
            urlAttributeCache.put(key, Boolean.FALSE);
            return false;
        }
        String elementName = pool.getDisplayName(element);
        boolean b = urlCombinations.contains(elementName + '+' + attributeName);
        urlAttributeCache.put(key, Boolean.valueOf(b));
        return b;
    }


    protected int currentElement;
    protected boolean escapeURIAttributes = true;
    protected NamePool pool;


     /**
     * Start of a document node.
     */


    public void startDocument(int properties) throws XPathException {
        nextReceiver.startDocument(properties);
        pool = getPipelineConfiguration().getConfiguration().getNamePool();
    }


    /**
     * Notify the start of an element
     *
     * @param nameCode   integer code identifying the name of the element within the name pool.
     * @param typeCode   integer code identifying the element's type within the name pool.
     * @param properties properties of the element node
     */


    public void startElement(int nameCode, int typeCode, int locationId, int properties) throws XPathException {
        currentElement = nameCode;
        nextReceiver.startElement(nameCode, typeCode, locationId, properties);
    }


    /**
     * Notify an attribute. Attributes are notified after the startElement event, and before any
     * children. Namespaces and attributes may be intermingled.
     *
     * @param nameCode   The name of the attribute, as held in the name pool
     * @param typeCode   The type of the attribute, as held in the name pool
     * @param properties Bit significant value. The following bits are defined:
     *                   <dd>DISABLE_ESCAPING</dd>    <dt>Disable escaping for this attribute</dt>
     *                   <dd>NO_SPECIAL_CHARACTERS</dd>      <dt>Attribute value contains no special characters</dt>
     * @throws IllegalStateException: attempt to output an attribute when there is no open element
     *                                start tag
     */


    public void attribute(int nameCode, int typeCode, CharSequence value, int locationId, int properties) throws XPathException {
        if (escapeURIAttributes &&
               isUrlAttribute(currentElement, nameCode) &&  
               (properties & ReceiverOptions.DISABLE_ESCAPING) == 0) {
            nextReceiver.attribute(nameCode, typeCode, escapeURL(value, true), locationId,
                    properties | ReceiverOptions.DISABLE_CHARACTER_MAPS);
        } else {
            nextReceiver.attribute(nameCode, typeCode, value, locationId, properties);
        }
    }


    /**
     * Escape a URI according to the HTML rules: that is, a non-ASCII character (specifically,
     * a character outside the range 32 - 126) is replaced by the %HH encoding of the octets in
     * its UTF-8 representation
     * @param url the URI to be escaped
     * @param normalize
     * @return the URI after escaping non-ASCII characters
     */


    public static CharSequence escapeURL(CharSequence url, boolean normalize) {
        // optimize for the common case where the string is all ASCII characters
        for (int i=url.length()-1; i>=0; i--) {
            char ch = url.charAt(i);
            if (ch<32 || ch>126) {
                if (normalize) {
                    CharSequence normalized = new Normalizer(Normalizer.C).normalize(url);
                    return reallyEscapeURL(normalized);
                } else {
                    return reallyEscapeURL(url);
                }
            }
        }
        return url;
    }


    private static CharSequence reallyEscapeURL(CharSequence url) {
        FastStringBuffer sb = new FastStringBuffer(url.length() + 20);
        final String hex = "0123456789ABCDEF";
        byte[] array = new byte[4];


        for (int i=0; i<url.length(); i++) {
            char ch = url.charAt(i);
            if (ch<32 || ch>126) {
                int used = UnicodeCharacterSet.getUTF8Encoding(ch,
                                                 (i+1 < url.length() ? url.charAt(i+1): ' '), array);
                for (int b=0; b<used; b++) {
                    //int v = (array[b]>=0 ? array[b] : 256 + array[b]);
                    int v = ((int)array[b]) & 0xff;
                    sb.append('%');
                    sb.append(hex.charAt(v/16));
                    sb.append(hex.charAt(v%16));
                }


            } else {
                sb.append(ch);
            }
        }
        return sb;
    }
}


//
// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
// you may not use this file except in compliance with the License. You may obtain a copy of the
// License at http://www.mozilla.org/MPL/
//
// Software distributed under the License is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the License for the specific language governing rights and limitations under the License.
//
// The Original Code is: all this file.
//
// The Initial Developer of the Original Code is Michael H. Kay.
//
// Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
//
// Contributor(s): none.
//
// Source Code of org.pdf4j.saxon.event.HTMLURIEscaper

// Related Classes of org.pdf4j.saxon.event.HTMLURIEscaper