Source Code of com.google.caja.lexer.escaping.UriUtil

// Copyright (C) 2009 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


package com.google.caja.lexer.escaping;


import com.google.caja.SomethingWidgyHappenedError;
import com.google.caja.util.Join;
import com.google.caja.util.Strings;
import com.google.common.collect.Lists;


import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Utilities for dealing with URIs.
 *
 * @author mikesamuel@gmail.com
 */
public class UriUtil {


  /** Matches a URI extracting bits with different escaping conventions. */
  private static final Pattern RFC_3986 = Pattern.compile(
      // Derived from RFC 3986 Appendix B
      //   1                 2          3             4            5
      "^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\\?([^#]*))?(?:#(.*))?$",
      Pattern.DOTALL);


  /**
   * Convert a URI to a string, %xx escaping some codepoints that are in the
   * RFC3986 reserved set, but not in contexts where they are significant.
   * This works around problems with inconsistencies in escaping conventions
   * in CSS URIs, but still allows us to make sure that URIs don't look like
   * code that a badly written error recovery routine might jump into.
   * <p>
   * This escaping of codepoints is allowed by section 2.4.2 of RFC 2396:
   * <blockquote>
   *   In some cases, data that could be represented by an unreserved
   *   character may appear escaped; for example, some of the unreserved
   *   "mark" characters are automatically escaped by some systems.  If the
   *   given URI scheme defines a canonicalization algorithm, then
   *   unreserved characters may be unescaped according to that algorithm.
   *   For example, "%7e" is sometimes used instead of "~" in an http URL
   *   path, but the two are equivalent for an http URL.
   * </blockquote>
   *
   * @param uri a non-opaque URI.
   */
  public static String normalizeUri(String uri) throws URISyntaxException {
    uri = normalizeSpecialCharacters(uri);


    // We don't use java.net.URI to recompose the URI because of problems with
    // encoding in the multi-argument constructor as described at
    // http://blog.limewire.org/?p=261:
    //     And it seems that the multi-arg constructors, which do URL encoding
    //     for you, do NOT provide a way for you to encode these characters -
    //     which means you can only ever use them for their reserved (unescaped)
    //     purpose.
    //     For example, suppose I want to produce this URL:
    //       http://foo.com/bar?a=b&c=jon%26doe
    //       uri = new URI("http", null, "foo.com", -1, "/bar",
    //                     "a=b&c=jon%26doe", null);
    //       uri.toASCIIString() -> http://foo.com/bar?a=b&c=jon%2526doe
    //     ...
    //     The upshot of all of this is that I claim the multi-arg constructors
    //     are unusable, unless you restrict your URLs to never use reserved
    //     characters as values. In our use case, we can't do that because we
    //     don't control what URIs are incoming / outgoing.
    Matcher m = RFC_3986.matcher(uri);
    // The RFC_2396 matches all strings.
    m.matches();
    String scheme = m.group(1);
    String authority = m.group(2);
    String path = m.group(3);
    String query = m.group(4);
    String fragment = m.group(5);


    // Path must start with / if the path is not empty or there is an authority
    // or scheme.
    // Remove unnecessary .. components in path.


    StringBuilder sb = new StringBuilder(uri.length());
    if (scheme != null) {
      normalizeScheme(scheme, sb);
      sb.append(':');
    }
    if (authority != null) {
      if ("".equals(authority) && !Strings.eqIgnoreCase("file", scheme)) {
        throw new URISyntaxException(uri, "Blank authority");
      }
      sb.append("//");
      normalizeAuthority(authority, sb);
    } else if (scheme != null
               && !(Strings.eqIgnoreCase("file", scheme)
                    || isOpaque(scheme))) {
      throw new URISyntaxException(uri, "Missing authority");
    }
    if (path.length() != 0 || sb.length() != 0) {
      normalizePath(path, sb.length() != 0 && !isOpaque(scheme), sb);
    }
    if (query != null) {
      sb.append('?');
      normalizeQuery(query, sb);
    }
    if (fragment != null) {
      sb.append('#');
      normalizeFragment(fragment, sb);
    }
    return sb.toString();
  }


  /**
   * Browsers typically treat non-Latin variants of URI special characters as
   * special so that Chinese users can copy and paste URLs rendered using
   * full-width characters.
   */
  private static String normalizeSpecialCharacters(String uri) {
    StringBuilder sb = null;
    int pos = 0;
    int n = uri.length();
    for (int i = n; --i >= 0;) {
      char subst;
      // The mapping below was derived by running the below.
      // We want to use a standard list so that we don't miss code-points on
      // older Java versions.
      // Map<String, List<String>> m = Maps.newLinkedHashMap();
      // m.put(":", new ArrayList<String>());
      // m.put("/", new ArrayList<String>());
      // m.put("?", new ArrayList<String>());
      // m.put("#", new ArrayList<String>());
      // m.put("=", new ArrayList<String>());
      // m.put("&", new ArrayList<String>());
      // m.put(".", new ArrayList<String>());
      // StringBuilder sb = new StringBuilder();
      // for (int i = 0; i < Character.MAX_CODE_POINT; ++i) {
      //   sb.setLength(0);
      //   sb.appendCodePoint(i);
      //   String abnormal = sb.toString();
      //   String normal = Normalizer.normalize(abnormal, Normalizer.Form.NFKD);
      //   List<String> abnormalForms = m.get(normal);
      //   if (abnormalForms != null && !abnormal.equals(normal)) {
      //     abnormalForms.add(abnormal);
      //   }
      // }
      // for (Map.Entry<String, List<String>> e : m.entrySet()) {
      //   System.out.println(e.getKey());
      //   for (String s : e.getValue()) {
      //     System.out.println("\t0x" + Integer.toString(s.codePointAt(0), 16)
      //                        + " : " + s);
      //   }
      // }
      switch (uri.charAt(i)) {
        case 0xfe13: case 0xfe55: case 0xff1a:               subst = ':'; break;
        case 0xff0f:                                         subst = '/'; break;
        case 0xfe16: case 0xfe56: case 0xff1f:               subst = '?'; break;
        case 0xfe5f: case 0xff03:                            subst = '#'; break;
        case 0x207c: case 0x208c: case 0xfe66: case 0xff1d : subst = '='; break;
        case 0xfe60: case 0xff06:                            subst = '&'; break;
        case 0x2024: case 0xfe52: case 0xff0e:               subst = '.'; break;
        default: continue;
      }
      if (sb == null) { sb = new StringBuilder(n); }
      sb.append(uri, pos, i).append(subst);
      pos = i + 1;
    }
    return sb == null ? uri : sb.toString();
  }


  public static URI resolve(URI base, String relative)
      throws URISyntaxException {
    URI abs = base.resolve(normalizeUri(relative));
    if (!abs.isOpaque()) {
      String path = abs.getPath();
      // Workaround a bug in java.net.URI.
      // TODO(mikesamuel): stop using java.net.URI and use a decent URL
      // implementation instead.
      if (path != null && (path.startsWith("/../") || path.equals("/.."))) {
        return null;
      }
    }
    return abs;
  }


  public static String encode(String part) {
    for (int i = 0, n = part.length(); i < n; ++i) {
      char ch = part.charAt(i);
      if (ch >= 0x80 || Escaping.URI_ESCAPES.getEscape(ch) != null) {
        StringBuilder out = new StringBuilder(n + (n >> 2));
        Escaping.escapeUri(part, i, out);
        return out.toString();
      }
    }
    return part;
  }


  private static void normalizeScheme(String scheme, StringBuilder out) {
    // Section 3.1:
    // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
    int pos = 0, n = scheme.length();
    for (int i = 0; i < n; ++i) {
      char ch = scheme.charAt(i);
      if (ch == '%') {
        if (isInvalidEsc(scheme, i)) {
          out.append(scheme, pos, i).append("%25");
          pos = i + 1;
        }
      } else if ('A' <= ch && ch <= 'Z') {
        // From Section 3.1.
        // Although schemes are case-insensitive, the canonical form is
        // lowercase and documents that specify schemes must do so with
        // lowercase letters.
        // ...
        // An implementation should accept uppercase letters as equivalent to
        // lowercase in scheme names (e.g., allow "HTTP" as well as "http") for
        // the sake of robustness but should only produce lowercase scheme names
        // for consistency.
        out.append(scheme, pos, i).append((char) (ch | 32));
        pos = i + 1;
      } else if (!(('a' <= ch && ch <= 'z')
                 || ('0' <= ch && ch <= '9')
                 || ch == '+' || ch == '-' || ch == '.')) {
        out.append(scheme, pos, i);
        pos = i + 1;
        pctEncode(ch, out);
      }
    }
    out.append(scheme, pos, n);
  }


  private static boolean isOpaque(String scheme) {
    return Strings.eqIgnoreCase("mailto", scheme)
        || Strings.eqIgnoreCase("javascript", scheme)
        || Strings.eqIgnoreCase("content", scheme)
        || Strings.eqIgnoreCase("data", scheme)
        || Strings.eqIgnoreCase("clsid", scheme);
  }


  private static void normalizeAuthority(String authority, StringBuilder out)
      throws URISyntaxException {
    // Section 3.2:
    // The authority component is preceded by a double slash "//" and is
    // terminated by the next slash "/", question-mark "?", or by the end of
    // the URI.  Within the authority component, the characters ";", ":",
    // "@", "?", and "/" are reserved.


    // The above quote from the RFC ignores the fact that '#' ends the authority
    // in URI references, but the pattern from Appendix B recognizes this.
    // We escape '@' since it is a significant character in CSS in @import
    // directives, as well as in conditional compilation comments.
    // This does not affect web applications in practice, since browsers
    // disallow '@' in authorities in HTTP and HTTPS since only a tiny number of
    // HTTP servers recognized it, but it was widely used by phishers.
    int pos = 0, n = authority.length();
    for (int i = 0; i < n; ++i) {
      char ch = authority.charAt(i);
      if (ch == '%') {
        if (isInvalidEsc(authority, i)) {
          out.append(authority, pos, i).append("%25");
          pos = i + 1;
        }
      } else if (ch == ':') {
        // We assume a subset of server-based URIs that only recognizes
        //     host[:port]
        // intentionally ignoring registry-based authorities and the user
        // portion of server-based URIs.
        for (int j = i + 1; j < n; ++j) {
          ch = authority.charAt(j);
          if (!('0' <= ch && ch <= '9')) {
            throw new URISyntaxException(
                authority, "Bad port " + authority.substring(i + 1), j);
          }
        }
        break;
      } else if (!(('a' <= ch && ch <= 'z')
                   || ('A' <= ch && ch <= 'Z')
                   || ('0' <= ch && ch <= '9')
                   // Escapes ; and @.
                   || ch == '-' || ch == '+' || ch == '.')) {
        out.append(authority, pos, i);
        pos = i + 1;
        pctEncode(ch, out);
      }
    }
    out.append(authority, pos, n);
  }


  private static void normalizePath(
      String path, boolean requireAbsPath, StringBuilder out) {
    String normPath = normalizeEscapesInPath(path);
    boolean isAbs = requireAbsPath;
    if (normPath.startsWith("/")) {
      normPath = normPath.substring(1);
      isAbs = true;
    }
    List<String> pathParts = Lists.newArrayList(normPath.split("/"));
    int i = 0;
    while (i < pathParts.size()) {
      String dottedPart = pathParts.get(i).replace("%2e", ".")
          .replace("%2E", ".");
      if (".".equals(dottedPart)) {
        pathParts.remove(i);
      } else if ("..".equals(dottedPart)) {
        if (i > 0 && !"..".equals(pathParts.get(i - 1))) {
          // back up over the previous part which will soon contain the next
          // part to process.
          --i;
          pathParts.subList(i, i + 2).clear();
        } else if (isAbs) {  // can't get to parent of the root.
          pathParts.remove(i);
        } else {
          // normalize so the "..".equals(pathParts.get(i - 1)) check above
          // works.
          pathParts.set(i, "..");
          ++i;
        }
      } else {
        ++i;
      }
    }
    if (isAbs) {
      // All paths for absolute URIs must start with '/'.
      // The URL http://foo?bar is not strictly legal.
      out.append('/');
    }
    Join.join(out, "/", pathParts);
  }


  private static String normalizeEscapesInPath(String path) {
    // Section 3.3
    // The path may consist of a sequence of path segments separated by a
    // single slash "/" character.  Within a path segment, the characters
    // "/", ";", "=", and "?" are reserved.  Each path segment may include a
    // sequence of parameters, indicated by the semicolon ";" character.
    // The parameters are not significant to the parsing of relative
    // references.


    // The assertion that '?' is reserved in a path segment is inconsistent with
    // the grammar in that same section.
    // Parameters to path segments are not widely used, and ';' is a CSS special
    // character and a natural target for a badly written error recovery scheme.
    StringBuilder sb = new StringBuilder();
    // escape all but [\w\d.:+-]
    int pos = 0, n = path.length();
    for (int i = 0; i < n; ++i) {
      char ch = path.charAt(i);
      if (ch == '%') {
        if (isInvalidEsc(path, i)) {
          sb.append(path, pos, i).append("%25");
          pos = i + 1;
        }
      } else if (!(('a' <= ch && ch <= 'z')
                   || ('A' <= ch && ch <= 'Z')
                   || ('0' <= ch && ch <= '9')
                   // Escapes ';' and '='
                   || ch == ':' || ch == '-' || ch == '+'
                   || ch == '.' || ch == '/' || ch == ',' || ch == '$')) {
        sb.append(path, pos, i);
        pos = i + 1;
        pctEncode(ch, sb);
      }
    }
    sb.append(path, pos, n);
    return sb.toString();
  }


  private static void normalizeQuery(String query, StringBuilder out) {
    // 3.4. Query Component
    // The query component is a string of information to be interpreted by
    // the resource.
    // Within a query component, the characters ";", "/", "?", ":", "@",
    // "&", "=", "+", ",", and "$" are reserved.


    // We preserve '&', '=', and the initial '?' but escape most others that are
    // CSS or JS special characters including ';', ':', and '@' for reasons
    // described above.


    int pos = 0, n = query.length();
    for (int i = 0; i < n; ++i) {
      char ch = query.charAt(i);
      if (ch == '%') {
        if (isInvalidEsc(query, i)) {
          out.append(query, pos, i).append("%25");
          pos = i + 1;
        }
      } else if (!(('a' <= ch && ch <= 'z')
                   || ('A' <= ch && ch <= 'Z')
                   || ('0' <= ch && ch <= '9')
                   // Escapes ';', ':' and '@'
                   || ch == '-' || ch == '+' || ch == '.' || ch == '='
                   || ch == '&' || ch == ',')) {
        out.append(query, pos, i);
        pos = i + 1;
        pctEncode(ch, out);
      }
    }
    out.append(query, pos, n);
  }


  private static void normalizeFragment(String fragment, StringBuilder out) {
    // Section 4.1
    // fragment      = *uric


    int pos = 0, n = fragment.length();
    for (int i = 0; i < n; ++i) {
      char ch = fragment.charAt(i);
      if (ch == '%') {
        if (isInvalidEsc(fragment, i)) {
          out.append(fragment, pos, i).append("%25");
          pos = i + 1;
        }
      } else if (!(('a' <= ch && ch <= 'z')
                   || ('A' <= ch && ch <= 'Z')
                   || ('0' <= ch && ch <= '9')
                   || ch == '-' || ch == '+' || ch == '.')) {
        out.append(fragment, pos, i);
        pos = i + 1;
        pctEncode(ch, out);
      }
    }
    out.append(fragment, pos, n);
  }


  private static boolean isInvalidEsc(String uriPart, int pctIdx) {
    return pctIdx + 2 >= uriPart.length()
        || !isHexDigit(uriPart.charAt(pctIdx + 1))
        || !isHexDigit(uriPart.charAt(pctIdx + 2));
  }


  private static boolean isHexDigit(char ch) {
    return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f')
        || ('A' <= ch && ch <= 'F');
  }


  private static void pctEncode(char ch, StringBuilder out) {
    try {
      Escaping.pctEncode(ch, out);
    } catch (IOException ex) {
      throw new SomethingWidgyHappenedError(
          "StringBuilders shouldn't throw IOException", ex);
    }
  }


  private UriUtil() { /* not instantiable */ }
}
Source Code of com.google.caja.lexer.escaping.UriUtil

Related Classes of com.google.caja.lexer.escaping.UriUtil